/*
 * drivers/block/drbd/drbd_main.c  (net-next-2.6.git, gitweb blame view)
 * at commit "drbd: Removed the BIO_RW_BARRIER support from the receiver/epoch code"
 */
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
b411b363 29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/smp_lock.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
67int drbdd_init(struct drbd_thread *);
68int drbd_worker(struct drbd_thread *);
69int drbd_asender(struct drbd_thread *);
70
71int drbd_init(void);
72static int drbd_open(struct block_device *bdev, fmode_t mode);
73static int drbd_release(struct gendisk *gd, fmode_t mode);
74static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
75static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
76 union drbd_state ns, enum chg_state_flags flags);
77static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78static void md_sync_timer_fn(unsigned long data);
79static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
e9e6f3ec 80static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
b411b363 81
82MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
83 "Lars Ellenberg <lars@linbit.com>");
84MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
85MODULE_VERSION(REL_VERSION);
86MODULE_LICENSE("GPL");
87MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
88MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
89
90#include <linux/moduleparam.h>
91/* allow_open_on_secondary */
92MODULE_PARM_DESC(allow_oos, "DONT USE!");
93/* thanks to these macros, if compiled into the kernel (not-module),
94 * this becomes the boot parameter drbd.minor_count */
95module_param(minor_count, uint, 0444);
96module_param(disable_sendpage, bool, 0644);
97module_param(allow_oos, bool, 0);
98module_param(cn_idx, uint, 0444);
99module_param(proc_details, int, 0644);
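/* Usage note (illustration, not driver code): loaded as a module these knobs
 * are set e.g. via "modprobe drbd minor_count=8"; when DRBD is built into the
 * kernel, the same knob becomes the boot parameter "drbd.minor_count=8", as
 * the comment above says. */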
100
101#ifdef CONFIG_DRBD_FAULT_INJECTION
102int enable_faults;
103int fault_rate;
104static int fault_count;
105int fault_devs;
106/* bitmap of enabled faults */
107module_param(enable_faults, int, 0664);
108/* fault rate % value - applies to all enabled faults */
109module_param(fault_rate, int, 0664);
110/* count of faults inserted */
111module_param(fault_count, int, 0664);
112/* bitmap of devices to insert faults on */
113module_param(fault_devs, int, 0644);
114#endif
115
116/* module parameter, defined */
117unsigned int minor_count = 32;
118int disable_sendpage;
119int allow_oos;
120unsigned int cn_idx = CN_IDX_DRBD;
121int proc_details; /* Detail level in /proc/drbd */
122
123/* Module parameter for setting the user mode helper program
124 * to run. Default is /sbin/drbdadm */
125char usermode_helper[80] = "/sbin/drbdadm";
126
127module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
128
129/* in 2.6.x, our device mapping and config info contains our virtual gendisks
130 * as member "struct gendisk *vdisk;"
131 */
132struct drbd_conf **minor_table;
133
134struct kmem_cache *drbd_request_cache;
135struct kmem_cache *drbd_ee_cache; /* epoch entries */
136struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
137struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
138mempool_t *drbd_request_mempool;
139mempool_t *drbd_ee_mempool;
140
141/* I do not use a standard mempool, because:
142 1) I want to hand out the pre-allocated objects first.
143 2) I want to be able to interrupt sleeping allocation with a signal.
144 Note: This is a single linked list, the next pointer is the private
145 member of struct page.
146 */
147struct page *drbd_pp_pool;
148spinlock_t drbd_pp_lock;
149int drbd_pp_vacant;
150wait_queue_head_t drbd_pp_wait;
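/* Illustrative sketch only -- the pool is manipulated elsewhere in DRBD, but
 * the idea is a LIFO threaded through each page's private field:
 *
 *	spin_lock(&drbd_pp_lock);
 *	set_page_private(page, (unsigned long)drbd_pp_pool);	* push *
 *	drbd_pp_pool = page;
 *	drbd_pp_vacant++;
 *	spin_unlock(&drbd_pp_lock);
 *	wake_up(&drbd_pp_wait);
 *
 * popping reverses this under the same lock; allocators sleep on drbd_pp_wait
 * (interruptibly, per the comment above) when the pool is empty. */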
151
152DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
153
7d4e9d09 154static const struct block_device_operations drbd_ops = {
155 .owner = THIS_MODULE,
156 .open = drbd_open,
157 .release = drbd_release,
158};
159
160#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
161
162#ifdef __CHECKER__
163/* When checking with sparse, and this is an inline function, sparse will
 164 give tons of false positives. When this is a real function, sparse works.
165 */
166int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
167{
168 int io_allowed;
169
170 atomic_inc(&mdev->local_cnt);
171 io_allowed = (mdev->state.disk >= mins);
172 if (!io_allowed) {
173 if (atomic_dec_and_test(&mdev->local_cnt))
174 wake_up(&mdev->misc_wait);
175 }
176 return io_allowed;
177}
178
179#endif
180
181/**
182 * DOC: The transfer log
183 *
184 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
185 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
186 * of the list. There is always at least one &struct drbd_tl_epoch object.
187 *
188 * Each &struct drbd_tl_epoch has a circular double linked list of requests
189 * attached.
190 */
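/* For illustration: with two epochs in flight the structure looks like
 *
 *	oldest_tle -> [epoch 4711: req, req, req] -> [epoch 4712: req] <- newest_tle
 *
 * and a barrier ack for 4711 lets tl_release() below complete that epoch's
 * requests and advance oldest_tle. */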
191static int tl_init(struct drbd_conf *mdev)
192{
193 struct drbd_tl_epoch *b;
194
195 /* during device minor initialization, we may well use GFP_KERNEL */
196 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
197 if (!b)
198 return 0;
199 INIT_LIST_HEAD(&b->requests);
200 INIT_LIST_HEAD(&b->w.list);
201 b->next = NULL;
202 b->br_number = 4711;
7e602c0a 203 b->n_writes = 0;
204 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
205
206 mdev->oldest_tle = b;
207 mdev->newest_tle = b;
208 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
209
210 mdev->tl_hash = NULL;
211 mdev->tl_hash_s = 0;
212
213 return 1;
214}
215
216static void tl_cleanup(struct drbd_conf *mdev)
217{
218 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
219 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
220 kfree(mdev->oldest_tle);
221 mdev->oldest_tle = NULL;
222 kfree(mdev->unused_spare_tle);
223 mdev->unused_spare_tle = NULL;
224 kfree(mdev->tl_hash);
225 mdev->tl_hash = NULL;
226 mdev->tl_hash_s = 0;
227}
228
229/**
230 * _tl_add_barrier() - Adds a barrier to the transfer log
231 * @mdev: DRBD device.
232 * @new: Barrier to be added before the current head of the TL.
233 *
234 * The caller must hold the req_lock.
235 */
236void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
237{
238 struct drbd_tl_epoch *newest_before;
239
240 INIT_LIST_HEAD(&new->requests);
241 INIT_LIST_HEAD(&new->w.list);
242 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
243 new->next = NULL;
7e602c0a 244 new->n_writes = 0;
245
246 newest_before = mdev->newest_tle;
247 /* never send a barrier number == 0, because that is special-cased
248 * when using TCQ for our write ordering code */
249 new->br_number = (newest_before->br_number+1) ?: 1;
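	/* worked example: if newest_before->br_number is 0xffffffff, the
	 * increment wraps to 0 and the "?: 1" picks 1 instead, so the
	 * special-cased barrier number 0 is never sent */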
250 if (mdev->newest_tle != new) {
251 mdev->newest_tle->next = new;
252 mdev->newest_tle = new;
253 }
254}
255
256/**
257 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
258 * @mdev: DRBD device.
259 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
260 * @set_size: Expected number of requests before that barrier.
261 *
262 * In case the passed barrier_nr or set_size does not match the oldest
263 * &struct drbd_tl_epoch objects this function will cause a termination
264 * of the connection.
265 */
266void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
267 unsigned int set_size)
268{
269 struct drbd_tl_epoch *b, *nob; /* next old barrier */
270 struct list_head *le, *tle;
271 struct drbd_request *r;
272
273 spin_lock_irq(&mdev->req_lock);
274
275 b = mdev->oldest_tle;
276
277 /* first some paranoia code */
278 if (b == NULL) {
279 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
280 barrier_nr);
281 goto bail;
282 }
283 if (b->br_number != barrier_nr) {
284 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
285 barrier_nr, b->br_number);
286 goto bail;
287 }
288 if (b->n_writes != set_size) {
289 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
290 barrier_nr, set_size, b->n_writes);
291 goto bail;
292 }
293
294 /* Clean up list of requests processed during current epoch */
295 list_for_each_safe(le, tle, &b->requests) {
296 r = list_entry(le, struct drbd_request, tl_requests);
297 _req_mod(r, barrier_acked);
298 }
299 /* There could be requests on the list waiting for completion
300 of the write to the local disk. To avoid corruptions of
 301 slab's data structures we have to remove the list's head.
302
303 Also there could have been a barrier ack out of sequence, overtaking
304 the write acks - which would be a bug and violating write ordering.
305 To not deadlock in case we lose connection while such requests are
306 still pending, we need some way to find them for the
 307 _req_mod(connection_lost_while_pending).
308
309 These have been list_move'd to the out_of_sequence_requests list in
310 _req_mod(, barrier_acked) above.
311 */
312 list_del_init(&b->requests);
313
314 nob = b->next;
315 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
316 _tl_add_barrier(mdev, b);
317 if (nob)
318 mdev->oldest_tle = nob;
319 /* if nob == NULL b was the only barrier, and becomes the new
320 barrier. Therefore mdev->oldest_tle points already to b */
321 } else {
322 D_ASSERT(nob != NULL);
323 mdev->oldest_tle = nob;
324 kfree(b);
325 }
326
327 spin_unlock_irq(&mdev->req_lock);
328 dec_ap_pending(mdev);
329
330 return;
331
332bail:
333 spin_unlock_irq(&mdev->req_lock);
334 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
335}
336
b411b363 337/**
11b58e73 338 * _tl_restart() - Walks the transfer log, and applies an action to all requests
b411b363 339 * @mdev: DRBD device.
11b58e73 340 * @what: The action/event to perform with all request objects
b411b363 341 *
342 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
343 * restart_frozen_disk_io.
b411b363 344 */
11b58e73 345static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
b411b363 346{
11b58e73 347 struct drbd_tl_epoch *b, *tmp, **pn;
b9b98716 348 struct list_head *le, *tle, carry_reads;
349 struct drbd_request *req;
350 int rv, n_writes, n_reads;
351
352 b = mdev->oldest_tle;
11b58e73 353 pn = &mdev->oldest_tle;
b411b363 354 while (b) {
355 n_writes = 0;
356 n_reads = 0;
b9b98716 357 INIT_LIST_HEAD(&carry_reads);
b411b363 358 list_for_each_safe(le, tle, &b->requests) {
359 req = list_entry(le, struct drbd_request, tl_requests);
360 rv = _req_mod(req, what);
361
362 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
363 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
364 }
365 tmp = b->next;
366
b9b98716 367 if (n_writes) {
368 if (what == resend) {
369 b->n_writes = n_writes;
370 if (b->w.cb == NULL) {
371 b->w.cb = w_send_barrier;
372 inc_ap_pending(mdev);
373 set_bit(CREATE_BARRIER, &mdev->flags);
374 }
375
376 drbd_queue_work(&mdev->data.work, &b->w);
377 }
378 pn = &b->next;
379 } else {
380 if (n_reads)
381 list_add(&carry_reads, &b->requests);
382 /* there could still be requests on that ring list,
383 * in case local io is still pending */
384 list_del(&b->requests);
385
386 /* dec_ap_pending corresponding to queue_barrier.
387 * the newest barrier may not have been queued yet,
388 * in which case w.cb is still NULL. */
389 if (b->w.cb != NULL)
390 dec_ap_pending(mdev);
391
392 if (b == mdev->newest_tle) {
393 /* recycle, but reinit! */
394 D_ASSERT(tmp == NULL);
395 INIT_LIST_HEAD(&b->requests);
b9b98716 396 list_splice(&carry_reads, &b->requests);
397 INIT_LIST_HEAD(&b->w.list);
398 b->w.cb = NULL;
399 b->br_number = net_random();
400 b->n_writes = 0;
401
402 *pn = b;
403 break;
404 }
405 *pn = tmp;
406 kfree(b);
b411b363 407 }
b411b363 408 b = tmp;
b9b98716 409 list_splice(&carry_reads, &b->requests);
b411b363 410 }
411}
412
413
414/**
415 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
416 * @mdev: DRBD device.
417 *
418 * This is called after the connection to the peer was lost. The storage covered
 419 by the requests on the transfer log gets marked as out of sync. Called from the
420 * receiver thread and the worker thread.
421 */
422void tl_clear(struct drbd_conf *mdev)
423{
424 struct list_head *le, *tle;
425 struct drbd_request *r;
426
427 spin_lock_irq(&mdev->req_lock);
428
429 _tl_restart(mdev, connection_lost_while_pending);
430
431 /* we expect this list to be empty. */
432 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
433
434 /* but just in case, clean it up anyways! */
435 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
436 r = list_entry(le, struct drbd_request, tl_requests);
437 /* It would be nice to complete outside of spinlock.
438 * But this is easier for now. */
439 _req_mod(r, connection_lost_while_pending);
440 }
441
442 /* ensure bit indicating barrier is required is clear */
443 clear_bit(CREATE_BARRIER, &mdev->flags);
444
445 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
446
447 spin_unlock_irq(&mdev->req_lock);
448}
449
450void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
451{
452 spin_lock_irq(&mdev->req_lock);
453 _tl_restart(mdev, what);
454 spin_unlock_irq(&mdev->req_lock);
455}
456
457/**
458 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
459 * @mdev: DRBD device.
460 * @os: old (current) state.
461 * @ns: new (wanted) state.
462 */
463static int cl_wide_st_chg(struct drbd_conf *mdev,
464 union drbd_state os, union drbd_state ns)
465{
466 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
467 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
468 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
469 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
470 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
471 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
472 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
473}
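/* Example: promoting a connected Secondary to Primary (os.role != R_PRIMARY,
 * ns.role == R_PRIMARY while both sides stay >= C_CONNECTED) matches the
 * first clause above and is negotiated with the peer; the same promotion on a
 * StandAlone node is a purely local state change. */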
474
475int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
476 union drbd_state mask, union drbd_state val)
477{
478 unsigned long flags;
479 union drbd_state os, ns;
480 int rv;
481
482 spin_lock_irqsave(&mdev->req_lock, flags);
483 os = mdev->state;
484 ns.i = (os.i & ~mask.i) | val.i;
485 rv = _drbd_set_state(mdev, ns, f, NULL);
486 ns = mdev->state;
487 spin_unlock_irqrestore(&mdev->req_lock, flags);
488
489 return rv;
490}
491
492/**
493 * drbd_force_state() - Impose a change which happens outside our control on our state
494 * @mdev: DRBD device.
495 * @mask: mask of state bits to change.
496 * @val: value of new state bits.
497 */
498void drbd_force_state(struct drbd_conf *mdev,
499 union drbd_state mask, union drbd_state val)
500{
501 drbd_change_state(mdev, CS_HARD, mask, val);
502}
503
504static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
505static int is_valid_state_transition(struct drbd_conf *,
506 union drbd_state, union drbd_state);
507static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
02bc7174 508 union drbd_state ns, const char **warn_sync_abort);
509int drbd_send_state_req(struct drbd_conf *,
510 union drbd_state, union drbd_state);
511
512static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
513 union drbd_state mask, union drbd_state val)
514{
515 union drbd_state os, ns;
516 unsigned long flags;
517 int rv;
518
519 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
520 return SS_CW_SUCCESS;
521
522 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
523 return SS_CW_FAILED_BY_PEER;
524
525 rv = 0;
526 spin_lock_irqsave(&mdev->req_lock, flags);
527 os = mdev->state;
528 ns.i = (os.i & ~mask.i) | val.i;
529 ns = sanitize_state(mdev, os, ns, NULL);
530
531 if (!cl_wide_st_chg(mdev, os, ns))
532 rv = SS_CW_NO_NEED;
533 if (!rv) {
534 rv = is_valid_state(mdev, ns);
535 if (rv == SS_SUCCESS) {
536 rv = is_valid_state_transition(mdev, ns, os);
537 if (rv == SS_SUCCESS)
538 rv = 0; /* cont waiting, otherwise fail. */
539 }
540 }
541 spin_unlock_irqrestore(&mdev->req_lock, flags);
542
543 return rv;
544}
545
546/**
547 * drbd_req_state() - Perform an eventually cluster wide state change
548 * @mdev: DRBD device.
549 * @mask: mask of state bits to change.
550 * @val: value of new state bits.
551 * @f: flags
552 *
553 * Should not be called directly, use drbd_request_state() or
554 * _drbd_request_state().
555 */
556static int drbd_req_state(struct drbd_conf *mdev,
557 union drbd_state mask, union drbd_state val,
558 enum chg_state_flags f)
559{
560 struct completion done;
561 unsigned long flags;
562 union drbd_state os, ns;
563 int rv;
564
565 init_completion(&done);
566
567 if (f & CS_SERIALIZE)
568 mutex_lock(&mdev->state_mutex);
569
570 spin_lock_irqsave(&mdev->req_lock, flags);
571 os = mdev->state;
572 ns.i = (os.i & ~mask.i) | val.i;
573 ns = sanitize_state(mdev, os, ns, NULL);
574
575 if (cl_wide_st_chg(mdev, os, ns)) {
576 rv = is_valid_state(mdev, ns);
577 if (rv == SS_SUCCESS)
578 rv = is_valid_state_transition(mdev, ns, os);
579 spin_unlock_irqrestore(&mdev->req_lock, flags);
580
581 if (rv < SS_SUCCESS) {
582 if (f & CS_VERBOSE)
583 print_st_err(mdev, os, ns, rv);
584 goto abort;
585 }
586
587 drbd_state_lock(mdev);
588 if (!drbd_send_state_req(mdev, mask, val)) {
589 drbd_state_unlock(mdev);
590 rv = SS_CW_FAILED_BY_PEER;
591 if (f & CS_VERBOSE)
592 print_st_err(mdev, os, ns, rv);
593 goto abort;
594 }
595
596 wait_event(mdev->state_wait,
597 (rv = _req_st_cond(mdev, mask, val)));
598
599 if (rv < SS_SUCCESS) {
600 drbd_state_unlock(mdev);
601 if (f & CS_VERBOSE)
602 print_st_err(mdev, os, ns, rv);
603 goto abort;
604 }
605 spin_lock_irqsave(&mdev->req_lock, flags);
606 os = mdev->state;
607 ns.i = (os.i & ~mask.i) | val.i;
608 rv = _drbd_set_state(mdev, ns, f, &done);
609 drbd_state_unlock(mdev);
610 } else {
611 rv = _drbd_set_state(mdev, ns, f, &done);
612 }
613
614 spin_unlock_irqrestore(&mdev->req_lock, flags);
615
616 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
617 D_ASSERT(current != mdev->worker.task);
618 wait_for_completion(&done);
619 }
620
621abort:
622 if (f & CS_SERIALIZE)
623 mutex_unlock(&mdev->state_mutex);
624
625 return rv;
626}
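/* In short: a cluster-wide change is first validated locally, then proposed
 * to the peer via drbd_send_state_req(), and drbd_req_state() sleeps in
 * _req_st_cond() until the peer's answer sets CL_ST_CHG_SUCCESS or
 * CL_ST_CHG_FAIL; only then is the new state committed with
 * _drbd_set_state(). */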
627
628/**
629 * _drbd_request_state() - Request a state change (with flags)
630 * @mdev: DRBD device.
631 * @mask: mask of state bits to change.
632 * @val: value of new state bits.
633 * @f: flags
634 *
635 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
636 * flag, or when logging of failed state change requests is not desired.
637 */
638int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
639 union drbd_state val, enum chg_state_flags f)
640{
641 int rv;
642
643 wait_event(mdev->state_wait,
644 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
645
646 return rv;
647}
648
649static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
650{
651 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
652 name,
653 drbd_conn_str(ns.conn),
654 drbd_role_str(ns.role),
655 drbd_role_str(ns.peer),
656 drbd_disk_str(ns.disk),
657 drbd_disk_str(ns.pdsk),
fb22c402 658 is_susp(ns) ? 's' : 'r',
659 ns.aftr_isp ? 'a' : '-',
660 ns.peer_isp ? 'p' : '-',
661 ns.user_isp ? 'u' : '-'
662 );
663}
664
665void print_st_err(struct drbd_conf *mdev,
666 union drbd_state os, union drbd_state ns, int err)
667{
668 if (err == SS_IN_TRANSIENT_STATE)
669 return;
670 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
671 print_st(mdev, " state", os);
672 print_st(mdev, "wanted", ns);
673}
674
675
676#define drbd_peer_str drbd_role_str
677#define drbd_pdsk_str drbd_disk_str
678
679#define drbd_susp_str(A) ((A) ? "1" : "0")
680#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
681#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
682#define drbd_user_isp_str(A) ((A) ? "1" : "0")
683
684#define PSC(A) \
685 ({ if (ns.A != os.A) { \
686 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
687 drbd_##A##_str(os.A), \
688 drbd_##A##_str(ns.A)); \
689 } })
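/* Example: PSC(conn) expands to a check of ns.conn against os.conn and, if
 * they differ, appends something like "conn( Connected -> SyncSource ) " to
 * the state-change line that __drbd_set_state() prints below. */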
690
691/**
692 * is_valid_state() - Returns an SS_ error code if ns is not valid
693 * @mdev: DRBD device.
694 * @ns: State to consider.
695 */
696static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
697{
698 /* See drbd_state_sw_errors in drbd_strings.c */
699
700 enum drbd_fencing_p fp;
701 int rv = SS_SUCCESS;
702
703 fp = FP_DONT_CARE;
704 if (get_ldev(mdev)) {
705 fp = mdev->ldev->dc.fencing;
706 put_ldev(mdev);
707 }
708
709 if (get_net_conf(mdev)) {
710 if (!mdev->net_conf->two_primaries &&
711 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
712 rv = SS_TWO_PRIMARIES;
713 put_net_conf(mdev);
714 }
715
716 if (rv <= 0)
717 /* already found a reason to abort */;
718 else if (ns.role == R_SECONDARY && mdev->open_cnt)
719 rv = SS_DEVICE_IN_USE;
720
721 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
722 rv = SS_NO_UP_TO_DATE_DISK;
723
724 else if (fp >= FP_RESOURCE &&
725 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
726 rv = SS_PRIMARY_NOP;
727
728 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
729 rv = SS_NO_UP_TO_DATE_DISK;
730
731 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
732 rv = SS_NO_LOCAL_DISK;
733
734 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
735 rv = SS_NO_REMOTE_DISK;
736
737 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
738 rv = SS_NO_UP_TO_DATE_DISK;
739
740 else if ((ns.conn == C_CONNECTED ||
741 ns.conn == C_WF_BITMAP_S ||
742 ns.conn == C_SYNC_SOURCE ||
743 ns.conn == C_PAUSED_SYNC_S) &&
744 ns.disk == D_OUTDATED)
745 rv = SS_CONNECTED_OUTDATES;
746
747 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
748 (mdev->sync_conf.verify_alg[0] == 0))
749 rv = SS_NO_VERIFY_ALG;
750
751 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
752 mdev->agreed_pro_version < 88)
753 rv = SS_NOT_SUPPORTED;
754
755 return rv;
756}
757
758/**
759 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
760 * @mdev: DRBD device.
761 * @ns: new state.
762 * @os: old state.
763 */
764static int is_valid_state_transition(struct drbd_conf *mdev,
765 union drbd_state ns, union drbd_state os)
766{
767 int rv = SS_SUCCESS;
768
769 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
770 os.conn > C_CONNECTED)
771 rv = SS_RESYNC_RUNNING;
772
773 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
774 rv = SS_ALREADY_STANDALONE;
775
776 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
777 rv = SS_IS_DISKLESS;
778
779 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
780 rv = SS_NO_NET_CONFIG;
781
782 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
783 rv = SS_LOWER_THAN_OUTDATED;
784
785 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
786 rv = SS_IN_TRANSIENT_STATE;
787
788 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
789 rv = SS_IN_TRANSIENT_STATE;
790
791 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
792 rv = SS_NEED_CONNECTION;
793
794 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
795 ns.conn != os.conn && os.conn > C_CONNECTED)
796 rv = SS_RESYNC_RUNNING;
797
798 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
799 os.conn < C_CONNECTED)
800 rv = SS_NEED_CONNECTION;
801
802 return rv;
803}
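/* Example: asking for C_STARTING_SYNC_S while a resync is already running
 * (os.conn > C_CONNECTED) is refused with SS_RESYNC_RUNNING by the first
 * check above. */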
804
805/**
806 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
807 * @mdev: DRBD device.
808 * @os: old state.
809 * @ns: new state.
810 * @warn_sync_abort:
811 *
 812 * When we lose connection, we have to set the state of the peer's disk (pdsk)
813 * to D_UNKNOWN. This rule and many more along those lines are in this function.
814 */
815static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
02bc7174 816 union drbd_state ns, const char **warn_sync_abort)
817{
818 enum drbd_fencing_p fp;
819
820 fp = FP_DONT_CARE;
821 if (get_ldev(mdev)) {
822 fp = mdev->ldev->dc.fencing;
823 put_ldev(mdev);
824 }
825
826 /* Disallow Network errors to configure a device's network part */
827 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
828 os.conn <= C_DISCONNECTING)
829 ns.conn = os.conn;
830
831 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
832 * If you try to go into some Sync* state, that shall fail (elsewhere). */
b411b363 833 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
f2906e18 834 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
835 ns.conn = os.conn;
836
837 /* we cannot fail (again) if we already detached */
838 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
839 ns.disk = D_DISKLESS;
840
841 /* if we are only D_ATTACHING yet,
842 * we can (and should) go directly to D_DISKLESS. */
843 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
844 ns.disk = D_DISKLESS;
845
846 /* After C_DISCONNECTING only C_STANDALONE may follow */
847 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
848 ns.conn = os.conn;
849
850 if (ns.conn < C_CONNECTED) {
851 ns.peer_isp = 0;
852 ns.peer = R_UNKNOWN;
853 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
854 ns.pdsk = D_UNKNOWN;
855 }
856
857 /* Clear the aftr_isp when becoming unconfigured */
858 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
859 ns.aftr_isp = 0;
860
861 /* Abort resync if a disk fails/detaches */
862 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
863 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
864 if (warn_sync_abort)
865 *warn_sync_abort =
866 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
867 "Online-verify" : "Resync";
868 ns.conn = C_CONNECTED;
869 }
870
871 if (ns.conn >= C_CONNECTED &&
872 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
873 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
874 switch (ns.conn) {
875 case C_WF_BITMAP_T:
876 case C_PAUSED_SYNC_T:
877 ns.disk = D_OUTDATED;
878 break;
879 case C_CONNECTED:
880 case C_WF_BITMAP_S:
881 case C_SYNC_SOURCE:
882 case C_PAUSED_SYNC_S:
883 ns.disk = D_UP_TO_DATE;
884 break;
885 case C_SYNC_TARGET:
886 ns.disk = D_INCONSISTENT;
887 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
888 break;
889 }
890 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
891 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
892 }
893
894 if (ns.conn >= C_CONNECTED &&
895 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
896 switch (ns.conn) {
897 case C_CONNECTED:
898 case C_WF_BITMAP_T:
899 case C_PAUSED_SYNC_T:
900 case C_SYNC_TARGET:
901 ns.pdsk = D_UP_TO_DATE;
902 break;
903 case C_WF_BITMAP_S:
904 case C_PAUSED_SYNC_S:
905 /* remap any consistent state to D_OUTDATED,
906 * but disallow "upgrade" of not even consistent states.
907 */
908 ns.pdsk =
909 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
910 ? os.pdsk : D_OUTDATED;
911 break;
912 case C_SYNC_SOURCE:
913 ns.pdsk = D_INCONSISTENT;
914 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
915 break;
916 }
917 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
918 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
919 }
920
921 /* Connection breaks down before we finished "Negotiating" */
922 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
923 get_ldev_if_state(mdev, D_NEGOTIATING)) {
924 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
925 ns.disk = mdev->new_state_tmp.disk;
926 ns.pdsk = mdev->new_state_tmp.pdsk;
927 } else {
928 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
929 ns.disk = D_DISKLESS;
930 ns.pdsk = D_UNKNOWN;
931 }
932 put_ldev(mdev);
933 }
934
935 if (fp == FP_STONITH &&
936 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
937 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
fb22c402 938 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
939
940 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
941 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
942 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
fb22c402 943 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
944
945 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
946 if (ns.conn == C_SYNC_SOURCE)
947 ns.conn = C_PAUSED_SYNC_S;
948 if (ns.conn == C_SYNC_TARGET)
949 ns.conn = C_PAUSED_SYNC_T;
950 } else {
951 if (ns.conn == C_PAUSED_SYNC_S)
952 ns.conn = C_SYNC_SOURCE;
953 if (ns.conn == C_PAUSED_SYNC_T)
954 ns.conn = C_SYNC_TARGET;
955 }
956
957 return ns;
958}
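/* Two examples of the implicit fixups above: when the new connection state
 * drops below C_CONNECTED, the peer role becomes R_UNKNOWN and pdsk is reset
 * to D_UNKNOWN; and if a local or peer disk fails while a resync or online
 * verify is running, the sync is cancelled (ns.conn forced back to
 * C_CONNECTED) and *warn_sync_abort is set for the caller to log. */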
959
960/* helper for __drbd_set_state */
961static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
962{
963 if (cs == C_VERIFY_T) {
964 /* starting online verify from an arbitrary position
965 * does not fit well into the existing protocol.
966 * on C_VERIFY_T, we initialize ov_left and friends
967 * implicitly in receive_DataRequest once the
968 * first P_OV_REQUEST is received */
969 mdev->ov_start_sector = ~(sector_t)0;
970 } else {
971 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
972 if (bit >= mdev->rs_total)
973 mdev->ov_start_sector =
974 BM_BIT_TO_SECT(mdev->rs_total - 1);
975 mdev->ov_position = mdev->ov_start_sector;
976 }
977}
978
979static void drbd_resume_al(struct drbd_conf *mdev)
980{
981 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
982 dev_info(DEV, "Resumed AL updates\n");
983}
984
985/**
986 * __drbd_set_state() - Set a new DRBD state
987 * @mdev: DRBD device.
988 * @ns: new state.
989 * @flags: Flags
990 * @done: Optional completion, that will get completed after the after_state_ch() finished
991 *
992 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
993 */
994int __drbd_set_state(struct drbd_conf *mdev,
995 union drbd_state ns, enum chg_state_flags flags,
996 struct completion *done)
997{
998 union drbd_state os;
999 int rv = SS_SUCCESS;
02bc7174 1000 const char *warn_sync_abort = NULL;
1001 struct after_state_chg_work *ascw;
1002
1003 os = mdev->state;
1004
1005 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1006
1007 if (ns.i == os.i)
1008 return SS_NOTHING_TO_DO;
1009
1010 if (!(flags & CS_HARD)) {
1011 /* pre-state-change checks ; only look at ns */
1012 /* See drbd_state_sw_errors in drbd_strings.c */
1013
1014 rv = is_valid_state(mdev, ns);
1015 if (rv < SS_SUCCESS) {
1016 /* If the old state was illegal as well, then let
1017 this happen...*/
1018
1616a254 1019 if (is_valid_state(mdev, os) == rv)
b411b363 1020 rv = is_valid_state_transition(mdev, ns, os);
1021 } else
1022 rv = is_valid_state_transition(mdev, ns, os);
1023 }
1024
1025 if (rv < SS_SUCCESS) {
1026 if (flags & CS_VERBOSE)
1027 print_st_err(mdev, os, ns, rv);
1028 return rv;
1029 }
1030
1031 if (warn_sync_abort)
02bc7174 1032 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1033
1034 {
1035 char *pbp, pb[300];
1036 pbp = pb;
1037 *pbp = 0;
1038 PSC(role);
1039 PSC(peer);
1040 PSC(conn);
1041 PSC(disk);
1042 PSC(pdsk);
1043 if (is_susp(ns) != is_susp(os))
1044 pbp += sprintf(pbp, "susp( %s -> %s ) ",
1045 drbd_susp_str(is_susp(os)),
1046 drbd_susp_str(is_susp(ns)));
1047 PSC(aftr_isp);
1048 PSC(peer_isp);
1049 PSC(user_isp);
1050 dev_info(DEV, "%s\n", pb);
1051 }
1052
1053 /* solve the race between becoming unconfigured,
1054 * worker doing the cleanup, and
1055 * admin reconfiguring us:
1056 * on (re)configure, first set CONFIG_PENDING,
1057 * then wait for a potentially exiting worker,
1058 * start the worker, and schedule one no_op.
1059 * then proceed with configuration.
1060 */
1061 if (ns.disk == D_DISKLESS &&
1062 ns.conn == C_STANDALONE &&
1063 ns.role == R_SECONDARY &&
1064 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1065 set_bit(DEVICE_DYING, &mdev->flags);
1066
1067 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1068 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1069 * drbd_ldev_destroy() won't happen before our corresponding
1070 * after_state_ch works run, where we put_ldev again. */
1071 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1072 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1073 atomic_inc(&mdev->local_cnt);
1074
1075 mdev->state = ns;
1076 wake_up(&mdev->misc_wait);
1077 wake_up(&mdev->state_wait);
1078
1079 /* aborted verify run. log the last position */
1080 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1081 ns.conn < C_CONNECTED) {
1082 mdev->ov_start_sector =
1083 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1084 dev_info(DEV, "Online Verify reached sector %llu\n",
1085 (unsigned long long)mdev->ov_start_sector);
1086 }
1087
1088 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1089 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1090 dev_info(DEV, "Syncer continues.\n");
1091 mdev->rs_paused += (long)jiffies
1092 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1093 if (ns.conn == C_SYNC_TARGET)
1094 mod_timer(&mdev->resync_timer, jiffies);
1095 }
1096
1097 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1098 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1099 dev_info(DEV, "Resync suspended\n");
1d7734a0 1100 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1101 }
1102
1103 if (os.conn == C_CONNECTED &&
1104 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1105 unsigned long now = jiffies;
1106 int i;
1107
b411b363 1108 mdev->ov_position = 0;
1d7734a0 1109 mdev->rs_total = drbd_bm_bits(mdev);
1110 if (mdev->agreed_pro_version >= 90)
1111 set_ov_position(mdev, ns.conn);
1112 else
1113 mdev->ov_start_sector = 0;
1114 mdev->ov_left = mdev->rs_total
1115 - BM_SECT_TO_BIT(mdev->ov_position);
1d7734a0 1116 mdev->rs_start = now;
1117 mdev->rs_last_events = 0;
1118 mdev->rs_last_sect_ev = 0;
1119 mdev->ov_last_oos_size = 0;
1120 mdev->ov_last_oos_start = 0;
1121
1122 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1123 mdev->rs_mark_left[i] = mdev->rs_total;
1124 mdev->rs_mark_time[i] = now;
1125 }
1126
1127 if (ns.conn == C_VERIFY_S) {
1128 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1129 (unsigned long long)mdev->ov_position);
1130 mod_timer(&mdev->resync_timer, jiffies);
1131 }
1132 }
1133
1134 if (get_ldev(mdev)) {
1135 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1136 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1137 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1138
1139 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1140 mdf |= MDF_CRASHED_PRIMARY;
1141 if (mdev->state.role == R_PRIMARY ||
1142 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1143 mdf |= MDF_PRIMARY_IND;
1144 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1145 mdf |= MDF_CONNECTED_IND;
1146 if (mdev->state.disk > D_INCONSISTENT)
1147 mdf |= MDF_CONSISTENT;
1148 if (mdev->state.disk > D_OUTDATED)
1149 mdf |= MDF_WAS_UP_TO_DATE;
1150 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1151 mdf |= MDF_PEER_OUT_DATED;
1152 if (mdf != mdev->ldev->md.flags) {
1153 mdev->ldev->md.flags = mdf;
1154 drbd_md_mark_dirty(mdev);
1155 }
1156 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1157 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1158 put_ldev(mdev);
1159 }
1160
1161 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1162 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1163 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1164 set_bit(CONSIDER_RESYNC, &mdev->flags);
1165
1166 /* Receiver should clean up itself */
1167 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1168 drbd_thread_stop_nowait(&mdev->receiver);
1169
1170 /* Now the receiver finished cleaning up itself, it should die */
1171 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1172 drbd_thread_stop_nowait(&mdev->receiver);
1173
1174 /* Upon network failure, we need to restart the receiver. */
1175 if (os.conn > C_TEAR_DOWN &&
1176 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1177 drbd_thread_restart_nowait(&mdev->receiver);
1178
1179 /* Resume AL writing if we get a connection */
1180 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1181 drbd_resume_al(mdev);
1182
1183 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1184 if (ascw) {
1185 ascw->os = os;
1186 ascw->ns = ns;
1187 ascw->flags = flags;
1188 ascw->w.cb = w_after_state_ch;
1189 ascw->done = done;
1190 drbd_queue_work(&mdev->data.work, &ascw->w);
1191 } else {
1192 dev_warn(DEV, "Could not kmalloc an ascw\n");
1193 }
1194
1195 return rv;
1196}
1197
1198static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1199{
1200 struct after_state_chg_work *ascw =
1201 container_of(w, struct after_state_chg_work, w);
1202 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1203 if (ascw->flags & CS_WAIT_COMPLETE) {
1204 D_ASSERT(ascw->done != NULL);
1205 complete(ascw->done);
1206 }
1207 kfree(ascw);
1208
1209 return 1;
1210}
1211
1212static void abw_start_sync(struct drbd_conf *mdev, int rv)
1213{
1214 if (rv) {
 1215 dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1216 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1217 return;
1218 }
1219
1220 switch (mdev->state.conn) {
1221 case C_STARTING_SYNC_T:
1222 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1223 break;
1224 case C_STARTING_SYNC_S:
1225 drbd_start_resync(mdev, C_SYNC_SOURCE);
1226 break;
1227 }
1228}
1229
1230/**
1231 * after_state_ch() - Perform after state change actions that may sleep
1232 * @mdev: DRBD device.
1233 * @os: old state.
1234 * @ns: new state.
1235 * @flags: Flags
1236 */
1237static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1238 union drbd_state ns, enum chg_state_flags flags)
1239{
1240 enum drbd_fencing_p fp;
67098930 1241 enum drbd_req_event what = nothing;
fb22c402 1242 union drbd_state nsm = (union drbd_state){ .i = -1 };
1243
1244 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1245 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1246 if (mdev->p_uuid)
1247 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1248 }
1249
1250 fp = FP_DONT_CARE;
1251 if (get_ldev(mdev)) {
1252 fp = mdev->ldev->dc.fencing;
1253 put_ldev(mdev);
1254 }
1255
1256 /* Inform userspace about the change... */
1257 drbd_bcast_state(mdev, ns);
1258
1259 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1260 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1261 drbd_khelper(mdev, "pri-on-incon-degr");
1262
1263 /* Here we have the actions that are performed after a
1264 state change. This function might sleep */
1265
1266 nsm.i = -1;
1267 if (ns.susp_nod) {
265be2d0 1268 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
67098930 1269 if (ns.conn == C_CONNECTED)
fb22c402 1270 what = resend, nsm.susp_nod = 0;
67098930 1271 else /* ns.conn > C_CONNECTED */
 1272 dev_err(DEV, "Unexpected Resync going on!\n");
1273 }
1274
67098930 1275 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1276 what = restart_frozen_disk_io, nsm.susp_nod = 0;
1277
1278 }
1279
fb22c402 1280 if (ns.susp_fen) {
1281 /* case1: The outdate peer handler is successful: */
1282 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
b411b363 1283 tl_clear(mdev);
1284 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1285 drbd_uuid_new_current(mdev);
1286 clear_bit(NEW_CUR_UUID, &mdev->flags);
43a5182c 1287 }
b411b363 1288 spin_lock_irq(&mdev->req_lock);
fb22c402 1289 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1290 spin_unlock_irq(&mdev->req_lock);
1291 }
1292 /* case2: The connection was established again: */
1293 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1294 clear_bit(NEW_CUR_UUID, &mdev->flags);
67098930 1295 what = resend;
fb22c402 1296 nsm.susp_fen = 0;
43a5182c 1297 }
b411b363 1298 }
1299
1300 if (what != nothing) {
1301 spin_lock_irq(&mdev->req_lock);
1302 _tl_restart(mdev, what);
1303 nsm.i &= mdev->state.i;
1304 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1305 spin_unlock_irq(&mdev->req_lock);
1306 }
1307
1308 /* Do not change the order of the if above and the two below... */
1309 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1310 drbd_send_uuids(mdev);
1311 drbd_send_state(mdev);
1312 }
1313 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1314 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1315
1316 /* Lost contact to peer's copy of the data */
1317 if ((os.pdsk >= D_INCONSISTENT &&
1318 os.pdsk != D_UNKNOWN &&
1319 os.pdsk != D_OUTDATED)
1320 && (ns.pdsk < D_INCONSISTENT ||
1321 ns.pdsk == D_UNKNOWN ||
1322 ns.pdsk == D_OUTDATED)) {
1323 if (get_ldev(mdev)) {
1324 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
2c8d1967 1325 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
fb22c402 1326 if (is_susp(mdev->state)) {
1327 set_bit(NEW_CUR_UUID, &mdev->flags);
1328 } else {
1329 drbd_uuid_new_current(mdev);
1330 drbd_send_uuids(mdev);
1331 }
2c8d1967 1332 }
1333 put_ldev(mdev);
1334 }
1335 }
1336
1337 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
18a50fa2 1338 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
2c8d1967 1339 drbd_uuid_new_current(mdev);
1340 drbd_send_uuids(mdev);
1341 }
1342
1343 /* D_DISKLESS Peer becomes secondary */
1344 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1345 drbd_al_to_on_disk_bm(mdev);
1346 put_ldev(mdev);
1347 }
1348
1349 /* Last part of the attaching process ... */
1350 if (ns.conn >= C_CONNECTED &&
1351 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
e89b591c 1352 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1353 drbd_send_uuids(mdev);
1354 drbd_send_state(mdev);
1355 }
1356
1357 /* We want to pause/continue resync, tell peer. */
1358 if (ns.conn >= C_CONNECTED &&
1359 ((os.aftr_isp != ns.aftr_isp) ||
1360 (os.user_isp != ns.user_isp)))
1361 drbd_send_state(mdev);
1362
1363 /* In case one of the isp bits got set, suspend other devices. */
1364 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1365 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1366 suspend_other_sg(mdev);
1367
 1368 /* Make sure the peer gets informed about possible state
1369 changes (ISP bits) while we were in WFReportParams. */
1370 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1371 drbd_send_state(mdev);
1372
1373 /* We are in the progress to start a full sync... */
1374 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1375 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1376 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1377
1378 /* We are invalidating our self... */
1379 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1380 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1381 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1382
1383 /* first half of local IO error, failure to attach,
1384 * or administrative detach */
1385 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1386 enum drbd_io_error_p eh;
1387 int was_io_error;
1388 /* corresponding get_ldev was in __drbd_set_state, to serialize
1389 * our cleanup here with the transition to D_DISKLESS,
 1390 * so it is safe to dereference ldev here. */
1391 eh = mdev->ldev->dc.on_io_error;
1392 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1393
1394 /* current state still has to be D_FAILED,
1395 * there is only one way out: to D_DISKLESS,
1396 * and that may only happen after our put_ldev below. */
1397 if (mdev->state.disk != D_FAILED)
1398 dev_err(DEV,
1399 "ASSERT FAILED: disk is %s during detach\n",
1400 drbd_disk_str(mdev->state.disk));
1401
1402 if (drbd_send_state(mdev))
82f59cc6 1403 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
e9e6f3ec 1404 else
82f59cc6 1405 dev_err(DEV, "Sending state for detaching disk failed\n");
1406
1407 drbd_rs_cancel_all(mdev);
b411b363 1408
1409 /* In case we want to get something to stable storage still,
1410 * this may be the last chance.
1411 * Following put_ldev may transition to D_DISKLESS. */
1412 drbd_md_sync(mdev);
1413 put_ldev(mdev);
1414
1415 if (was_io_error && eh == EP_CALL_HELPER)
1416 drbd_khelper(mdev, "local-io-error");
1417 }
b411b363 1418
1419 /* second half of local IO error, failure to attach,
1420 * or administrative detach,
1421 * after local_cnt references have reached zero again */
1422 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1423 /* We must still be diskless,
1424 * re-attach has to be serialized with this! */
1425 if (mdev->state.disk != D_DISKLESS)
1426 dev_err(DEV,
1427 "ASSERT FAILED: disk is %s while going diskless\n",
1428 drbd_disk_str(mdev->state.disk));
e9e6f3ec 1429
1430 mdev->rs_total = 0;
1431 mdev->rs_failed = 0;
1432 atomic_set(&mdev->rs_pending_cnt, 0);
9d282875 1433
e9e6f3ec 1434 if (drbd_send_state(mdev))
82f59cc6 1435 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
e9e6f3ec 1436 else
1437 dev_err(DEV, "Sending state for being diskless failed\n");
1438 /* corresponding get_ldev in __drbd_set_state
 1439 * this may finally trigger drbd_ldev_destroy. */
1440 put_ldev(mdev);
1441 }
1442
1443 /* Disks got bigger while they were detached */
1444 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1445 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1446 if (ns.conn == C_CONNECTED)
1447 resync_after_online_grow(mdev);
1448 }
1449
1450 /* A resync finished or aborted, wake paused devices... */
1451 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1452 (os.peer_isp && !ns.peer_isp) ||
1453 (os.user_isp && !ns.user_isp))
1454 resume_next_sg(mdev);
1455
1456 /* sync target done with resync. Explicitly notify peer, even though
1457 * it should (at least for non-empty resyncs) already know itself. */
1458 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1459 drbd_send_state(mdev);
1460
f70b3511 1461 /* free tl_hash if we Got thawed and are C_STANDALONE */
fb22c402 1462 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1463 drbd_free_tl_hash(mdev);
1464
1465 /* Upon network connection, we need to start the receiver */
1466 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1467 drbd_thread_start(&mdev->receiver);
1468
1469 /* Terminate worker thread if we are unconfigured - it will be
1470 restarted as needed... */
1471 if (ns.disk == D_DISKLESS &&
1472 ns.conn == C_STANDALONE &&
1473 ns.role == R_SECONDARY) {
1474 if (os.aftr_isp != ns.aftr_isp)
1475 resume_next_sg(mdev);
1476 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1477 if (test_bit(DEVICE_DYING, &mdev->flags))
1478 drbd_thread_stop_nowait(&mdev->worker);
1479 }
1480
1481 drbd_md_sync(mdev);
1482}
1483
1484
1485static int drbd_thread_setup(void *arg)
1486{
1487 struct drbd_thread *thi = (struct drbd_thread *) arg;
1488 struct drbd_conf *mdev = thi->mdev;
1489 unsigned long flags;
1490 int retval;
1491
1492restart:
1493 retval = thi->function(thi);
1494
1495 spin_lock_irqsave(&thi->t_lock, flags);
1496
1497 /* if the receiver has been "Exiting", the last thing it did
1498 * was set the conn state to "StandAlone",
1499 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1500 * and receiver thread will be "started".
1501 * drbd_thread_start needs to set "Restarting" in that case.
1502 * t_state check and assignment needs to be within the same spinlock,
1503 * so either thread_start sees Exiting, and can remap to Restarting,
 1504 * or thread_start sees None, and can proceed as normal.
1505 */
1506
1507 if (thi->t_state == Restarting) {
1508 dev_info(DEV, "Restarting %s\n", current->comm);
1509 thi->t_state = Running;
1510 spin_unlock_irqrestore(&thi->t_lock, flags);
1511 goto restart;
1512 }
1513
1514 thi->task = NULL;
1515 thi->t_state = None;
1516 smp_mb();
1517 complete(&thi->stop);
1518 spin_unlock_irqrestore(&thi->t_lock, flags);
1519
1520 dev_info(DEV, "Terminating %s\n", current->comm);
1521
1522 /* Release mod reference taken when thread was started */
1523 module_put(THIS_MODULE);
1524 return retval;
1525}
1526
1527static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1528 int (*func) (struct drbd_thread *))
1529{
1530 spin_lock_init(&thi->t_lock);
1531 thi->task = NULL;
1532 thi->t_state = None;
1533 thi->function = func;
1534 thi->mdev = mdev;
1535}
1536
1537int drbd_thread_start(struct drbd_thread *thi)
1538{
1539 struct drbd_conf *mdev = thi->mdev;
1540 struct task_struct *nt;
1541 unsigned long flags;
1542
1543 const char *me =
1544 thi == &mdev->receiver ? "receiver" :
1545 thi == &mdev->asender ? "asender" :
1546 thi == &mdev->worker ? "worker" : "NONSENSE";
1547
1548 /* is used from state engine doing drbd_thread_stop_nowait,
1549 * while holding the req lock irqsave */
1550 spin_lock_irqsave(&thi->t_lock, flags);
1551
1552 switch (thi->t_state) {
1553 case None:
1554 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1555 me, current->comm, current->pid);
1556
1557 /* Get ref on module for thread - this is released when thread exits */
1558 if (!try_module_get(THIS_MODULE)) {
1559 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1560 spin_unlock_irqrestore(&thi->t_lock, flags);
1561 return FALSE;
1562 }
1563
1564 init_completion(&thi->stop);
1565 D_ASSERT(thi->task == NULL);
1566 thi->reset_cpu_mask = 1;
1567 thi->t_state = Running;
1568 spin_unlock_irqrestore(&thi->t_lock, flags);
1569 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1570
1571 nt = kthread_create(drbd_thread_setup, (void *) thi,
1572 "drbd%d_%s", mdev_to_minor(mdev), me);
1573
1574 if (IS_ERR(nt)) {
1575 dev_err(DEV, "Couldn't start thread\n");
1576
1577 module_put(THIS_MODULE);
1578 return FALSE;
1579 }
1580 spin_lock_irqsave(&thi->t_lock, flags);
1581 thi->task = nt;
1582 thi->t_state = Running;
1583 spin_unlock_irqrestore(&thi->t_lock, flags);
1584 wake_up_process(nt);
1585 break;
1586 case Exiting:
1587 thi->t_state = Restarting;
1588 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1589 me, current->comm, current->pid);
1590 /* fall through */
1591 case Running:
1592 case Restarting:
1593 default:
1594 spin_unlock_irqrestore(&thi->t_lock, flags);
1595 break;
1596 }
1597
1598 return TRUE;
1599}
1600
1601
1602void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1603{
1604 unsigned long flags;
1605
1606 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1607
1608 /* may be called from state engine, holding the req lock irqsave */
1609 spin_lock_irqsave(&thi->t_lock, flags);
1610
1611 if (thi->t_state == None) {
1612 spin_unlock_irqrestore(&thi->t_lock, flags);
1613 if (restart)
1614 drbd_thread_start(thi);
1615 return;
1616 }
1617
1618 if (thi->t_state != ns) {
1619 if (thi->task == NULL) {
1620 spin_unlock_irqrestore(&thi->t_lock, flags);
1621 return;
1622 }
1623
1624 thi->t_state = ns;
1625 smp_mb();
1626 init_completion(&thi->stop);
1627 if (thi->task != current)
1628 force_sig(DRBD_SIGKILL, thi->task);
1629
1630 }
1631
1632 spin_unlock_irqrestore(&thi->t_lock, flags);
1633
1634 if (wait)
1635 wait_for_completion(&thi->stop);
1636}
1637
1638#ifdef CONFIG_SMP
1639/**
1640 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1641 * @mdev: DRBD device.
1642 *
1643 * Forces all threads of a device onto the same CPU. This is beneficial for
1644 * DRBD's performance. May be overwritten by user's configuration.
1645 */
1646void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1647{
1648 int ord, cpu;
1649
1650 /* user override. */
1651 if (cpumask_weight(mdev->cpu_mask))
1652 return;
1653
1654 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1655 for_each_online_cpu(cpu) {
1656 if (ord-- == 0) {
1657 cpumask_set_cpu(cpu, mdev->cpu_mask);
1658 return;
1659 }
1660 }
1661 /* should not be reached */
1662 cpumask_setall(mdev->cpu_mask);
1663}
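/* Worked example: with four online CPUs, minor 5 yields ord = 5 % 4 = 1, so
 * the loop picks the second online CPU, and all of this device's threads
 * (worker, receiver, asender) end up pinned there via
 * drbd_thread_current_set_cpu(). */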
1664
1665/**
1666 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1667 * @mdev: DRBD device.
1668 *
1669 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1670 * prematurely.
1671 */
1672void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1673{
1674 struct task_struct *p = current;
1675 struct drbd_thread *thi =
1676 p == mdev->asender.task ? &mdev->asender :
1677 p == mdev->receiver.task ? &mdev->receiver :
1678 p == mdev->worker.task ? &mdev->worker :
1679 NULL;
1680 ERR_IF(thi == NULL)
1681 return;
1682 if (!thi->reset_cpu_mask)
1683 return;
1684 thi->reset_cpu_mask = 0;
1685 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1686}
1687#endif
1688
1689/* the appropriate socket mutex must be held already */
1690int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
0b70a13d 1691 enum drbd_packets cmd, struct p_header80 *h,
1692 size_t size, unsigned msg_flags)
1693{
1694 int sent, ok;
1695
1696 ERR_IF(!h) return FALSE;
1697 ERR_IF(!size) return FALSE;
1698
1699 h->magic = BE_DRBD_MAGIC;
1700 h->command = cpu_to_be16(cmd);
0b70a13d 1701 h->length = cpu_to_be16(size-sizeof(struct p_header80));
b411b363 1702
1703 sent = drbd_send(mdev, sock, h, size, msg_flags);
1704
1705 ok = (sent == size);
1706 if (!ok)
1707 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1708 cmdname(cmd), (int)size, sent);
1709 return ok;
1710}
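/* Wire-format note: h->length carries only the payload size, i.e. the total
 * size minus sizeof(struct p_header80) (an 8-byte header of magic, command
 * and length, assuming the usual layout), so a 24-byte packet goes out with
 * length == 24 - sizeof(struct p_header80) == 16. */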
1711
1712/* don't pass the socket. we may only look at it
1713 * when we hold the appropriate socket mutex.
1714 */
1715int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
0b70a13d 1716 enum drbd_packets cmd, struct p_header80 *h, size_t size)
1717{
1718 int ok = 0;
1719 struct socket *sock;
1720
1721 if (use_data_socket) {
1722 mutex_lock(&mdev->data.mutex);
1723 sock = mdev->data.socket;
1724 } else {
1725 mutex_lock(&mdev->meta.mutex);
1726 sock = mdev->meta.socket;
1727 }
1728
1729 /* drbd_disconnect() could have called drbd_free_sock()
1730 * while we were waiting in down()... */
1731 if (likely(sock != NULL))
1732 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1733
1734 if (use_data_socket)
1735 mutex_unlock(&mdev->data.mutex);
1736 else
1737 mutex_unlock(&mdev->meta.mutex);
1738 return ok;
1739}
1740
1741int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1742 size_t size)
1743{
0b70a13d 1744 struct p_header80 h;
1745 int ok;
1746
1747 h.magic = BE_DRBD_MAGIC;
1748 h.command = cpu_to_be16(cmd);
1749 h.length = cpu_to_be16(size);
1750
1751 if (!drbd_get_data_sock(mdev))
1752 return 0;
1753
1754 ok = (sizeof(h) ==
1755 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1756 ok = ok && (size ==
1757 drbd_send(mdev, mdev->data.socket, data, size, 0));
1758
1759 drbd_put_data_sock(mdev);
1760
1761 return ok;
1762}
1763
1764int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1765{
8e26f9cc 1766 struct p_rs_param_95 *p;
1767 struct socket *sock;
1768 int size, rv;
1769 const int apv = mdev->agreed_pro_version;
1770
1771 size = apv <= 87 ? sizeof(struct p_rs_param)
1772 : apv == 88 ? sizeof(struct p_rs_param)
1773 + strlen(mdev->sync_conf.verify_alg) + 1
1774 : apv <= 94 ? sizeof(struct p_rs_param_89)
1775 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1776
1777 /* used from admin command context and receiver/worker context.
1778 * to avoid kmalloc, grab the socket right here,
1779 * then use the pre-allocated sbuf there */
1780 mutex_lock(&mdev->data.mutex);
1781 sock = mdev->data.socket;
1782
1783 if (likely(sock != NULL)) {
1784 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1785
8e26f9cc 1786 p = &mdev->data.sbuf.rs_param_95;
1787
1788 /* initialize verify_alg and csums_alg */
1789 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1790
1791 p->rate = cpu_to_be32(sc->rate);
1792 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1793 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1794 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1795 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1796
1797 if (apv >= 88)
1798 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1799 if (apv >= 89)
1800 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1801
1802 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1803 } else
1804 rv = 0; /* not ok */
1805
1806 mutex_unlock(&mdev->data.mutex);
1807
1808 return rv;
1809}
1810
1811int drbd_send_protocol(struct drbd_conf *mdev)
1812{
1813 struct p_protocol *p;
cf14c2e9 1814 int size, cf, rv;
1815
1816 size = sizeof(struct p_protocol);
1817
1818 if (mdev->agreed_pro_version >= 87)
1819 size += strlen(mdev->net_conf->integrity_alg) + 1;
1820
1821 /* we must not recurse into our own queue,
1822 * as that is blocked during handshake */
1823 p = kmalloc(size, GFP_NOIO);
1824 if (p == NULL)
1825 return 0;
1826
1827 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1828 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1829 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1830 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
1831 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1832
1833 cf = 0;
1834 if (mdev->net_conf->want_lose)
1835 cf |= CF_WANT_LOSE;
1836 if (mdev->net_conf->dry_run) {
1837 if (mdev->agreed_pro_version >= 92)
1838 cf |= CF_DRY_RUN;
1839 else {
1840 dev_err(DEV, "--dry-run is not supported by peer");
7ac314c8 1841 kfree(p);
1842 return 0;
1843 }
1844 }
1845 p->conn_flags = cpu_to_be32(cf);
1846
1847 if (mdev->agreed_pro_version >= 87)
1848 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1849
1850 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
0b70a13d 1851 (struct p_header80 *)p, size);
1852 kfree(p);
1853 return rv;
1854}
1855
1856int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1857{
1858 struct p_uuids p;
1859 int i;
1860
1861 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1862 return 1;
1863
1864 for (i = UI_CURRENT; i < UI_SIZE; i++)
1865 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1866
1867 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1868 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1869 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1870 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1871 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1872 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1873
1874 put_ldev(mdev);
1875
1876 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
0b70a13d 1877 (struct p_header80 *)&p, sizeof(p));
1878}
1879
1880int drbd_send_uuids(struct drbd_conf *mdev)
1881{
1882 return _drbd_send_uuids(mdev, 0);
1883}
1884
1885int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1886{
1887 return _drbd_send_uuids(mdev, 8);
1888}
1889
1890
1891int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1892{
1893 struct p_rs_uuid p;
1894
1895 p.uuid = cpu_to_be64(val);
1896
1897 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
0b70a13d 1898 (struct p_header80 *)&p, sizeof(p));
1899}
1900
e89b591c 1901int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1902{
1903 struct p_sizes p;
1904 sector_t d_size, u_size;
1905 int q_order_type;
1906 int ok;
1907
1908 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1909 D_ASSERT(mdev->ldev->backing_bdev);
1910 d_size = drbd_get_max_capacity(mdev->ldev);
1911 u_size = mdev->ldev->dc.disk_size;
1912 q_order_type = drbd_queue_order_type(mdev);
1913 put_ldev(mdev);
1914 } else {
1915 d_size = 0;
1916 u_size = 0;
1917 q_order_type = QUEUE_ORDERED_NONE;
1918 }
1919
1920 p.d_size = cpu_to_be64(d_size);
1921 p.u_size = cpu_to_be64(u_size);
1922 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1923 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1924 p.queue_order_type = cpu_to_be16(q_order_type);
1925 p.dds_flags = cpu_to_be16(flags);
1926
1927 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
0b70a13d 1928 (struct p_header80 *)&p, sizeof(p));
1929 return ok;
1930}
1931
1932/**
1933 * drbd_send_state() - Sends the drbd state to the peer
1934 * @mdev: DRBD device.
1935 */
1936int drbd_send_state(struct drbd_conf *mdev)
1937{
1938 struct socket *sock;
1939 struct p_state p;
1940 int ok = 0;
1941
 1942	/* Grab state lock so we won't send state if we're in the middle
1943 * of a cluster wide state change on another thread */
1944 drbd_state_lock(mdev);
1945
1946 mutex_lock(&mdev->data.mutex);
1947
1948 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1949 sock = mdev->data.socket;
1950
1951 if (likely(sock != NULL)) {
1952 ok = _drbd_send_cmd(mdev, sock, P_STATE,
0b70a13d 1953 (struct p_header80 *)&p, sizeof(p), 0);
1954 }
1955
1956 mutex_unlock(&mdev->data.mutex);
1957
1958 drbd_state_unlock(mdev);
1959 return ok;
1960}
1961
1962int drbd_send_state_req(struct drbd_conf *mdev,
1963 union drbd_state mask, union drbd_state val)
1964{
1965 struct p_req_state p;
1966
1967 p.mask = cpu_to_be32(mask.i);
1968 p.val = cpu_to_be32(val.i);
1969
1970 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
0b70a13d 1971 (struct p_header80 *)&p, sizeof(p));
1972}
1973
1974int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1975{
1976 struct p_req_state_reply p;
1977
1978 p.retcode = cpu_to_be32(retcode);
1979
1980 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
0b70a13d 1981 (struct p_header80 *)&p, sizeof(p));
1982}
1983
1984int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1985 struct p_compressed_bm *p,
1986 struct bm_xfer_ctx *c)
1987{
1988 struct bitstream bs;
1989 unsigned long plain_bits;
1990 unsigned long tmp;
1991 unsigned long rl;
1992 unsigned len;
1993 unsigned toggle;
1994 int bits;
1995
1996 /* may we use this feature? */
1997 if ((mdev->sync_conf.use_rle == 0) ||
1998 (mdev->agreed_pro_version < 90))
1999 return 0;
2000
2001 if (c->bit_offset >= c->bm_bits)
2002 return 0; /* nothing to do. */
2003
 2004	/* use at most this many bytes */
2005 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2006 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2007 /* plain bits covered in this code string */
2008 plain_bits = 0;
2009
2010 /* p->encoding & 0x80 stores whether the first run length is set.
2011 * bit offset is implicit.
2012 * start with toggle == 2 to be able to tell the first iteration */
2013 toggle = 2;
2014
 2015	/* see how many plain bits we can stuff into one packet
2016 * using RLE and VLI. */
2017 do {
2018 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2019 : _drbd_bm_find_next(mdev, c->bit_offset);
2020 if (tmp == -1UL)
2021 tmp = c->bm_bits;
2022 rl = tmp - c->bit_offset;
2023
2024 if (toggle == 2) { /* first iteration */
2025 if (rl == 0) {
2026 /* the first checked bit was set,
2027 * store start value, */
2028 DCBP_set_start(p, 1);
2029 /* but skip encoding of zero run length */
2030 toggle = !toggle;
2031 continue;
2032 }
2033 DCBP_set_start(p, 0);
2034 }
2035
2036 /* paranoia: catch zero runlength.
2037 * can only happen if bitmap is modified while we scan it. */
2038 if (rl == 0) {
2039 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2040 "t:%u bo:%lu\n", toggle, c->bit_offset);
2041 return -1;
2042 }
2043
2044 bits = vli_encode_bits(&bs, rl);
2045 if (bits == -ENOBUFS) /* buffer full */
2046 break;
2047 if (bits <= 0) {
2048 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2049 return 0;
2050 }
2051
2052 toggle = !toggle;
2053 plain_bits += rl;
2054 c->bit_offset = tmp;
2055 } while (c->bit_offset < c->bm_bits);
2056
2057 len = bs.cur.b - p->code + !!bs.cur.bit;
2058
2059 if (plain_bits < (len << 3)) {
2060 /* incompressible with this method.
2061 * we need to rewind both word and bit position. */
2062 c->bit_offset -= plain_bits;
2063 bm_xfer_ctx_bit_to_word_offset(c);
2064 c->bit_offset = c->word_offset * BITS_PER_LONG;
2065 return 0;
2066 }
2067
2068 /* RLE + VLI was able to compress it just fine.
2069 * update c->word_offset. */
2070 bm_xfer_ctx_bit_to_word_offset(c);
2071
2072 /* store pad_bits */
2073 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2074
2075 return len;
2076}
2077
2078enum { OK, FAILED, DONE }
2079send_bitmap_rle_or_plain(struct drbd_conf *mdev,
0b70a13d 2080 struct p_header80 *h, struct bm_xfer_ctx *c)
2081{
2082 struct p_compressed_bm *p = (void*)h;
2083 unsigned long num_words;
2084 int len;
2085 int ok;
2086
2087 len = fill_bitmap_rle_bits(mdev, p, c);
2088
2089 if (len < 0)
2090 return FAILED;
2091
2092 if (len) {
2093 DCBP_set_code(p, RLE_VLI_Bits);
2094 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2095 sizeof(*p) + len, 0);
2096
2097 c->packets[0]++;
2098 c->bytes[0] += sizeof(*p) + len;
2099
2100 if (c->bit_offset >= c->bm_bits)
2101 len = 0; /* DONE */
2102 } else {
2103 /* was not compressible.
2104 * send a buffer full of plain text bits instead. */
2105 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2106 len = num_words * sizeof(long);
2107 if (len)
2108 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2109 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
0b70a13d 2110 h, sizeof(struct p_header80) + len, 0);
2111 c->word_offset += num_words;
2112 c->bit_offset = c->word_offset * BITS_PER_LONG;
2113
2114 c->packets[1]++;
0b70a13d 2115 c->bytes[1] += sizeof(struct p_header80) + len;
2116
2117 if (c->bit_offset > c->bm_bits)
2118 c->bit_offset = c->bm_bits;
2119 }
2120 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2121
2122 if (ok == DONE)
2123 INFO_bm_xfer_stats(mdev, "send", c);
2124 return ok;
2125}
2126
2127/* See the comment at receive_bitmap() */
2128int _drbd_send_bitmap(struct drbd_conf *mdev)
2129{
2130 struct bm_xfer_ctx c;
0b70a13d 2131 struct p_header80 *p;
2132 int ret;
2133
2134 ERR_IF(!mdev->bitmap) return FALSE;
2135
2136 /* maybe we should use some per thread scratch page,
2137 * and allocate that during initial device creation? */
0b70a13d 2138 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2139 if (!p) {
2140 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2141 return FALSE;
2142 }
2143
2144 if (get_ldev(mdev)) {
2145 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2146 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2147 drbd_bm_set_all(mdev);
2148 if (drbd_bm_write(mdev)) {
 2149			/* write_bm did fail! Leave the full sync flag set in the meta data,
2150 * but otherwise process as per normal - need to tell other
2151 * side that a full resync is required! */
2152 dev_err(DEV, "Failed to write bitmap to disk!\n");
2153 } else {
2154 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2155 drbd_md_sync(mdev);
2156 }
2157 }
2158 put_ldev(mdev);
2159 }
2160
2161 c = (struct bm_xfer_ctx) {
2162 .bm_bits = drbd_bm_bits(mdev),
2163 .bm_words = drbd_bm_words(mdev),
2164 };
2165
2166 do {
2167 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2168 } while (ret == OK);
2169
2170 free_page((unsigned long) p);
2171 return (ret == DONE);
2172}
2173
2174int drbd_send_bitmap(struct drbd_conf *mdev)
2175{
2176 int err;
2177
2178 if (!drbd_get_data_sock(mdev))
2179 return -1;
2180 err = !_drbd_send_bitmap(mdev);
2181 drbd_put_data_sock(mdev);
2182 return err;
2183}
2184
2185int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2186{
2187 int ok;
2188 struct p_barrier_ack p;
2189
2190 p.barrier = barrier_nr;
2191 p.set_size = cpu_to_be32(set_size);
2192
2193 if (mdev->state.conn < C_CONNECTED)
2194 return FALSE;
2195 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
0b70a13d 2196 (struct p_header80 *)&p, sizeof(p));
2197 return ok;
2198}
2199
2200/**
2201 * _drbd_send_ack() - Sends an ack packet
2202 * @mdev: DRBD device.
2203 * @cmd: Packet command code.
2204 * @sector: sector, needs to be in big endian byte order
2205 * @blksize: size in byte, needs to be in big endian byte order
2206 * @block_id: Id, big endian byte order
2207 */
2208static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2209 u64 sector,
2210 u32 blksize,
2211 u64 block_id)
2212{
2213 int ok;
2214 struct p_block_ack p;
2215
2216 p.sector = sector;
2217 p.block_id = block_id;
2218 p.blksize = blksize;
2219 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2220
2221 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2222 return FALSE;
2223 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
0b70a13d 2224 (struct p_header80 *)&p, sizeof(p));
2225 return ok;
2226}
2227
2228/* dp->sector and dp->block_id already/still in network byte order,
2229 * data_size is payload size according to dp->head,
2230 * and may need to be corrected for digest size. */
b411b363 2231int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2b2bf214 2232 struct p_data *dp, int data_size)
b411b363 2233{
2234 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2235 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2236 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2237 dp->block_id);
2238}
2239
2240int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2241 struct p_block_req *rp)
2242{
2243 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2244}
2245
2246/**
2247 * drbd_send_ack() - Sends an ack packet
2248 * @mdev: DRBD device.
2249 * @cmd: Packet command code.
2250 * @e: Epoch entry.
2251 */
2252int drbd_send_ack(struct drbd_conf *mdev,
2253 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2254{
2255 return _drbd_send_ack(mdev, cmd,
2256 cpu_to_be64(e->sector),
2257 cpu_to_be32(e->size),
2258 e->block_id);
2259}
2260
2261/* This function misuses the block_id field to signal if the blocks
 2262 * are in sync or not. */
2263int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2264 sector_t sector, int blksize, u64 block_id)
2265{
2266 return _drbd_send_ack(mdev, cmd,
2267 cpu_to_be64(sector),
2268 cpu_to_be32(blksize),
2269 cpu_to_be64(block_id));
2270}
2271
2272int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2273 sector_t sector, int size, u64 block_id)
2274{
2275 int ok;
2276 struct p_block_req p;
2277
2278 p.sector = cpu_to_be64(sector);
2279 p.block_id = block_id;
2280 p.blksize = cpu_to_be32(size);
2281
2282 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
0b70a13d 2283 (struct p_header80 *)&p, sizeof(p));
2284 return ok;
2285}
2286
2287int drbd_send_drequest_csum(struct drbd_conf *mdev,
2288 sector_t sector, int size,
2289 void *digest, int digest_size,
2290 enum drbd_packets cmd)
2291{
2292 int ok;
2293 struct p_block_req p;
2294
2295 p.sector = cpu_to_be64(sector);
2296 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2297 p.blksize = cpu_to_be32(size);
2298
2299 p.head.magic = BE_DRBD_MAGIC;
2300 p.head.command = cpu_to_be16(cmd);
0b70a13d 2301 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2302
2303 mutex_lock(&mdev->data.mutex);
2304
2305 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2306 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2307
2308 mutex_unlock(&mdev->data.mutex);
2309
2310 return ok;
2311}
2312
2313int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2314{
2315 int ok;
2316 struct p_block_req p;
2317
2318 p.sector = cpu_to_be64(sector);
2319 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2320 p.blksize = cpu_to_be32(size);
2321
2322 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
0b70a13d 2323 (struct p_header80 *)&p, sizeof(p));
2324 return ok;
2325}
2326
2327/* called on sndtimeo
2328 * returns FALSE if we should retry,
2329 * TRUE if we think connection is dead
2330 */
2331static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2332{
2333 int drop_it;
2334 /* long elapsed = (long)(jiffies - mdev->last_received); */
2335
2336 drop_it = mdev->meta.socket == sock
2337 || !mdev->asender.task
2338 || get_t_state(&mdev->asender) != Running
2339 || mdev->state.conn < C_CONNECTED;
2340
2341 if (drop_it)
2342 return TRUE;
2343
2344 drop_it = !--mdev->ko_count;
2345 if (!drop_it) {
2346 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2347 current->comm, current->pid, mdev->ko_count);
2348 request_ping(mdev);
2349 }
2350
2351 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2352}
2353
2354/* The idea of sendpage seems to be to put some kind of reference
2355 * to the page into the skb, and to hand it over to the NIC. In
2356 * this process get_page() gets called.
2357 *
2358 * As soon as the page was really sent over the network put_page()
2359 * gets called by some part of the network layer. [ NIC driver? ]
2360 *
2361 * [ get_page() / put_page() increment/decrement the count. If count
2362 * reaches 0 the page will be freed. ]
2363 *
2364 * This works nicely with pages from FSs.
2365 * But this means that in protocol A we might signal IO completion too early!
2366 *
2367 * In order not to corrupt data during a resync we must make sure
 2368 * that we do not reuse our own buffer pages (EEs) too early, therefore
2369 * we have the net_ee list.
2370 *
 2371 * XFS still seems to have problems: it submits pages with page_count == 0!
2372 * As a workaround, we disable sendpage on pages
2373 * with page_count == 0 or PageSlab.
2374 */
2375static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2376 int offset, size_t size, unsigned msg_flags)
b411b363 2377{
ba11ad9a 2378 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2379 kunmap(page);
2380 if (sent == size)
2381 mdev->send_cnt += size>>9;
2382 return sent == size;
2383}
2384
2385static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2386 int offset, size_t size, unsigned msg_flags)
2387{
2388 mm_segment_t oldfs = get_fs();
2389 int sent, ok;
2390 int len = size;
2391
2392 /* e.g. XFS meta- & log-data is in slab pages, which have a
2393 * page_count of 0 and/or have PageSlab() set.
2394 * we cannot use send_page for those, as that does get_page();
2395 * put_page(); and would cause either a VM_BUG directly, or
2396 * __page_cache_release a page that would actually still be referenced
2397 * by someone, leading to some obscure delayed Oops somewhere else. */
2398 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
ba11ad9a 2399 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 2400
ba11ad9a 2401 msg_flags |= MSG_NOSIGNAL;
2402 drbd_update_congested(mdev);
2403 set_fs(KERNEL_DS);
2404 do {
2405 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2406 offset, len,
ba11ad9a 2407 msg_flags);
2408 if (sent == -EAGAIN) {
2409 if (we_should_drop_the_connection(mdev,
2410 mdev->data.socket))
2411 break;
2412 else
2413 continue;
2414 }
2415 if (sent <= 0) {
2416 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2417 __func__, (int)size, len, sent);
2418 break;
2419 }
2420 len -= sent;
2421 offset += sent;
2422 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2423 set_fs(oldfs);
2424 clear_bit(NET_CONGESTED, &mdev->flags);
2425
2426 ok = (len == 0);
2427 if (likely(ok))
2428 mdev->send_cnt += size>>9;
2429 return ok;
2430}
2431
2432static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2433{
2434 struct bio_vec *bvec;
2435 int i;
ba11ad9a 2436 /* hint all but last page with MSG_MORE */
2437 __bio_for_each_segment(bvec, bio, i, 0) {
2438 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2439 bvec->bv_offset, bvec->bv_len,
2440 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2441 return 0;
2442 }
2443 return 1;
2444}
2445
2446static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2447{
2448 struct bio_vec *bvec;
2449 int i;
ba11ad9a 2450 /* hint all but last page with MSG_MORE */
2451 __bio_for_each_segment(bvec, bio, i, 0) {
2452 if (!_drbd_send_page(mdev, bvec->bv_page,
2453 bvec->bv_offset, bvec->bv_len,
2454 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2455 return 0;
2456 }
2457 return 1;
2458}
2459
2460static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2461{
2462 struct page *page = e->pages;
2463 unsigned len = e->size;
ba11ad9a 2464 /* hint all but last page with MSG_MORE */
2465 page_chain_for_each(page) {
2466 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2467 if (!_drbd_send_page(mdev, page, 0, l,
2468 page_chain_next(page) ? MSG_MORE : 0))
2469 return 0;
2470 len -= l;
2471 }
2472 return 1;
2473}
2474
2475static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2476{
2477 if (mdev->agreed_pro_version >= 95)
2478 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2479 (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
2480 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2481 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2482 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2483 else
2484 return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
2485}
2486
2487/* Used to send write requests
2488 * R_PRIMARY -> Peer (P_DATA)
2489 */
2490int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2491{
2492 int ok = 1;
2493 struct p_data p;
2494 unsigned int dp_flags = 0;
2495 void *dgb;
2496 int dgs;
2497
2498 if (!drbd_get_data_sock(mdev))
2499 return 0;
2500
2501 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2502 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2503
d5373389 2504 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2505 p.head.h80.magic = BE_DRBD_MAGIC;
2506 p.head.h80.command = cpu_to_be16(P_DATA);
2507 p.head.h80.length =
2508 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2509 } else {
2510 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2511 p.head.h95.command = cpu_to_be16(P_DATA);
2512 p.head.h95.length =
2513 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2514 }
2515
2516 p.sector = cpu_to_be64(req->sector);
2517 p.block_id = (unsigned long)req;
2518 p.seq_num = cpu_to_be32(req->seq_num =
2519 atomic_add_return(1, &mdev->packet_seq));
b411b363 2520
2521 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2522
2523 if (mdev->state.conn >= C_SYNC_SOURCE &&
2524 mdev->state.conn <= C_PAUSED_SYNC_T)
2525 dp_flags |= DP_MAY_SET_IN_SYNC;
2526
2527 p.dp_flags = cpu_to_be32(dp_flags);
2528 set_bit(UNPLUG_REMOTE, &mdev->flags);
2529 ok = (sizeof(p) ==
ba11ad9a 2530 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2531 if (ok && dgs) {
2532 dgb = mdev->int_dig_out;
45bb912b 2533 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
ba11ad9a 2534 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2535 }
2536 if (ok) {
2537 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2538 ok = _drbd_send_bio(mdev, req->master_bio);
2539 else
2540 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2541 }
2542
2543 drbd_put_data_sock(mdev);
bd26bfc5 2544
2545 return ok;
2546}
2547
2548/* answer packet, used to send data back for read requests:
2549 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2550 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2551 */
2552int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2553 struct drbd_epoch_entry *e)
2554{
2555 int ok;
2556 struct p_data p;
2557 void *dgb;
2558 int dgs;
2559
2560 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2561 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2562
d5373389 2563 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2564 p.head.h80.magic = BE_DRBD_MAGIC;
2565 p.head.h80.command = cpu_to_be16(cmd);
2566 p.head.h80.length =
2567 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2568 } else {
2569 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2570 p.head.h95.command = cpu_to_be16(cmd);
2571 p.head.h95.length =
2572 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2573 }
2574
2575 p.sector = cpu_to_be64(e->sector);
2576 p.block_id = e->block_id;
2577 /* p.seq_num = 0; No sequence numbers here.. */
2578
2579 /* Only called by our kernel thread.
2580 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2581 * in response to admin command or module unload.
2582 */
2583 if (!drbd_get_data_sock(mdev))
2584 return 0;
2585
0b70a13d 2586 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2587 if (ok && dgs) {
2588 dgb = mdev->int_dig_out;
45bb912b 2589 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
ba11ad9a 2590 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2591 }
2592 if (ok)
45bb912b 2593 ok = _drbd_send_zc_ee(mdev, e);
2594
2595 drbd_put_data_sock(mdev);
bd26bfc5 2596
2597 return ok;
2598}
2599
2600/*
2601 drbd_send distinguishes two cases:
2602
2603 Packets sent via the data socket "sock"
2604 and packets sent via the meta data socket "msock"
2605
2606 sock msock
2607 -----------------+-------------------------+------------------------------
2608 timeout conf.timeout / 2 conf.timeout / 2
2609 timeout action send a ping via msock Abort communication
2610 and close all sockets
2611*/
2612
2613/*
2614 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2615 */
2616int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2617 void *buf, size_t size, unsigned msg_flags)
2618{
2619 struct kvec iov;
2620 struct msghdr msg;
2621 int rv, sent = 0;
2622
2623 if (!sock)
2624 return -1000;
2625
2626 /* THINK if (signal_pending) return ... ? */
2627
2628 iov.iov_base = buf;
2629 iov.iov_len = size;
2630
2631 msg.msg_name = NULL;
2632 msg.msg_namelen = 0;
2633 msg.msg_control = NULL;
2634 msg.msg_controllen = 0;
2635 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2636
2637 if (sock == mdev->data.socket) {
2638 mdev->ko_count = mdev->net_conf->ko_count;
2639 drbd_update_congested(mdev);
2640 }
2641 do {
2642 /* STRANGE
2643 * tcp_sendmsg does _not_ use its size parameter at all ?
2644 *
2645 * -EAGAIN on timeout, -EINTR on signal.
2646 */
2647/* THINK
2648 * do we need to block DRBD_SIG if sock == &meta.socket ??
2649 * otherwise wake_asender() might interrupt some send_*Ack !
2650 */
2651 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2652 if (rv == -EAGAIN) {
2653 if (we_should_drop_the_connection(mdev, sock))
2654 break;
2655 else
2656 continue;
2657 }
2658 D_ASSERT(rv != 0);
2659 if (rv == -EINTR) {
2660 flush_signals(current);
2661 rv = 0;
2662 }
2663 if (rv < 0)
2664 break;
2665 sent += rv;
2666 iov.iov_base += rv;
2667 iov.iov_len -= rv;
2668 } while (sent < size);
2669
2670 if (sock == mdev->data.socket)
2671 clear_bit(NET_CONGESTED, &mdev->flags);
2672
2673 if (rv <= 0) {
2674 if (rv != -EAGAIN) {
2675 dev_err(DEV, "%s_sendmsg returned %d\n",
2676 sock == mdev->meta.socket ? "msock" : "sock",
2677 rv);
2678 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2679 } else
2680 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2681 }
2682
2683 return sent;
2684}
2685
2686static int drbd_open(struct block_device *bdev, fmode_t mode)
2687{
2688 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2689 unsigned long flags;
2690 int rv = 0;
2691
6e9624b8 2692 lock_kernel();
2693 spin_lock_irqsave(&mdev->req_lock, flags);
2694 /* to have a stable mdev->state.role
2695 * and no race with updating open_cnt */
2696
2697 if (mdev->state.role != R_PRIMARY) {
2698 if (mode & FMODE_WRITE)
2699 rv = -EROFS;
2700 else if (!allow_oos)
2701 rv = -EMEDIUMTYPE;
2702 }
2703
2704 if (!rv)
2705 mdev->open_cnt++;
2706 spin_unlock_irqrestore(&mdev->req_lock, flags);
6e9624b8 2707 unlock_kernel();
2708
2709 return rv;
2710}
2711
2712static int drbd_release(struct gendisk *gd, fmode_t mode)
2713{
2714 struct drbd_conf *mdev = gd->private_data;
6e9624b8 2715 lock_kernel();
b411b363 2716 mdev->open_cnt--;
6e9624b8 2717 unlock_kernel();
2718 return 0;
2719}
2720
2721static void drbd_unplug_fn(struct request_queue *q)
2722{
2723 struct drbd_conf *mdev = q->queuedata;
2724
2725 /* unplug FIRST */
2726 spin_lock_irq(q->queue_lock);
2727 blk_remove_plug(q);
2728 spin_unlock_irq(q->queue_lock);
2729
2730 /* only if connected */
2731 spin_lock_irq(&mdev->req_lock);
2732 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2733 D_ASSERT(mdev->state.role == R_PRIMARY);
2734 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2735 /* add to the data.work queue,
2736 * unless already queued.
2737 * XXX this might be a good addition to drbd_queue_work
2738 * anyways, to detect "double queuing" ... */
2739 if (list_empty(&mdev->unplug_work.list))
2740 drbd_queue_work(&mdev->data.work,
2741 &mdev->unplug_work);
2742 }
2743 }
2744 spin_unlock_irq(&mdev->req_lock);
2745
2746 if (mdev->state.disk >= D_INCONSISTENT)
2747 drbd_kick_lo(mdev);
2748}
2749
2750static void drbd_set_defaults(struct drbd_conf *mdev)
2751{
2752 /* This way we get a compile error when sync_conf grows,
 2753	   and we forget to initialize it here */
2754 mdev->sync_conf = (struct syncer_conf) {
2755 /* .rate = */ DRBD_RATE_DEF,
2756 /* .after = */ DRBD_AFTER_DEF,
2757 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
2758 /* .verify_alg = */ {}, 0,
2759 /* .cpu_mask = */ {}, 0,
2760 /* .csums_alg = */ {}, 0,
e756414f 2761 /* .use_rle = */ 0,
2762 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2763 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2764 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2765 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
2766 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2767 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
2768 };
2769
 2770	/* Have to do it this way, because the layout differs between
2771 big endian and little endian */
2772 mdev->state = (union drbd_state) {
2773 { .role = R_SECONDARY,
2774 .peer = R_UNKNOWN,
2775 .conn = C_STANDALONE,
2776 .disk = D_DISKLESS,
2777 .pdsk = D_UNKNOWN,
2778 .susp = 0,
2779 .susp_nod = 0,
2780 .susp_fen = 0
2781 } };
2782}
2783
2784void drbd_init_set_defaults(struct drbd_conf *mdev)
2785{
2786 /* the memset(,0,) did most of this.
2787 * note: only assignments, no allocation in here */
2788
2789 drbd_set_defaults(mdev);
2790
2791 /* for now, we do NOT yet support it,
2792 * even though we start some framework
2793 * to eventually support barriers */
2794 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2795
2796 atomic_set(&mdev->ap_bio_cnt, 0);
2797 atomic_set(&mdev->ap_pending_cnt, 0);
2798 atomic_set(&mdev->rs_pending_cnt, 0);
2799 atomic_set(&mdev->unacked_cnt, 0);
2800 atomic_set(&mdev->local_cnt, 0);
2801 atomic_set(&mdev->net_cnt, 0);
2802 atomic_set(&mdev->packet_seq, 0);
2803 atomic_set(&mdev->pp_in_use, 0);
435f0740 2804 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 2805 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 2806 atomic_set(&mdev->rs_sect_ev, 0);
2807
2808 mutex_init(&mdev->md_io_mutex);
2809 mutex_init(&mdev->data.mutex);
2810 mutex_init(&mdev->meta.mutex);
2811 sema_init(&mdev->data.work.s, 0);
2812 sema_init(&mdev->meta.work.s, 0);
2813 mutex_init(&mdev->state_mutex);
2814
2815 spin_lock_init(&mdev->data.work.q_lock);
2816 spin_lock_init(&mdev->meta.work.q_lock);
2817
2818 spin_lock_init(&mdev->al_lock);
2819 spin_lock_init(&mdev->req_lock);
2820 spin_lock_init(&mdev->peer_seq_lock);
2821 spin_lock_init(&mdev->epoch_lock);
2822
2823 INIT_LIST_HEAD(&mdev->active_ee);
2824 INIT_LIST_HEAD(&mdev->sync_ee);
2825 INIT_LIST_HEAD(&mdev->done_ee);
2826 INIT_LIST_HEAD(&mdev->read_ee);
2827 INIT_LIST_HEAD(&mdev->net_ee);
2828 INIT_LIST_HEAD(&mdev->resync_reads);
2829 INIT_LIST_HEAD(&mdev->data.work.q);
2830 INIT_LIST_HEAD(&mdev->meta.work.q);
2831 INIT_LIST_HEAD(&mdev->resync_work.list);
2832 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 2833 INIT_LIST_HEAD(&mdev->go_diskless.list);
2834 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2835 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 2836
2837 mdev->resync_work.cb = w_resync_inactive;
2838 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 2839 mdev->go_diskless.cb = w_go_diskless;
2840 mdev->md_sync_work.cb = w_md_sync;
2841 mdev->bm_io_work.w.cb = w_bitmap_io;
2842 init_timer(&mdev->resync_timer);
2843 init_timer(&mdev->md_sync_timer);
2844 mdev->resync_timer.function = resync_timer_fn;
2845 mdev->resync_timer.data = (unsigned long) mdev;
2846 mdev->md_sync_timer.function = md_sync_timer_fn;
2847 mdev->md_sync_timer.data = (unsigned long) mdev;
2848
2849 init_waitqueue_head(&mdev->misc_wait);
2850 init_waitqueue_head(&mdev->state_wait);
84dfb9f5 2851 init_waitqueue_head(&mdev->net_cnt_wait);
2852 init_waitqueue_head(&mdev->ee_wait);
2853 init_waitqueue_head(&mdev->al_wait);
2854 init_waitqueue_head(&mdev->seq_wait);
2855
2856 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2857 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2858 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2859
2860 mdev->agreed_pro_version = PRO_VERSION_MAX;
2451fc3b 2861 mdev->write_ordering = WO_bdev_flush;
2862 mdev->resync_wenr = LC_FREE;
2863}
2864
2865void drbd_mdev_cleanup(struct drbd_conf *mdev)
2866{
1d7734a0 2867 int i;
2868 if (mdev->receiver.t_state != None)
2869 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2870 mdev->receiver.t_state);
2871
2872 /* no need to lock it, I'm the only thread alive */
2873 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2874 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2875 mdev->al_writ_cnt =
2876 mdev->bm_writ_cnt =
2877 mdev->read_cnt =
2878 mdev->recv_cnt =
2879 mdev->send_cnt =
2880 mdev->writ_cnt =
2881 mdev->p_size =
2882 mdev->rs_start =
2883 mdev->rs_total =
2884 mdev->rs_failed = 0;
2885 mdev->rs_last_events = 0;
0f0601f4 2886 mdev->rs_last_sect_ev = 0;
2887 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2888 mdev->rs_mark_left[i] = 0;
2889 mdev->rs_mark_time[i] = 0;
2890 }
2891 D_ASSERT(mdev->net_conf == NULL);
2892
2893 drbd_set_my_capacity(mdev, 0);
2894 if (mdev->bitmap) {
2895 /* maybe never allocated. */
02d9a94b 2896 drbd_bm_resize(mdev, 0, 1);
2897 drbd_bm_cleanup(mdev);
2898 }
2899
2900 drbd_free_resources(mdev);
0778286a 2901 clear_bit(AL_SUSPENDED, &mdev->flags);
2902
2903 /*
 2904	 * currently we call drbd_init_ee only on module load, so
 2905	 * we may call drbd_release_ee only on module unload!
2906 */
2907 D_ASSERT(list_empty(&mdev->active_ee));
2908 D_ASSERT(list_empty(&mdev->sync_ee));
2909 D_ASSERT(list_empty(&mdev->done_ee));
2910 D_ASSERT(list_empty(&mdev->read_ee));
2911 D_ASSERT(list_empty(&mdev->net_ee));
2912 D_ASSERT(list_empty(&mdev->resync_reads));
2913 D_ASSERT(list_empty(&mdev->data.work.q));
2914 D_ASSERT(list_empty(&mdev->meta.work.q));
2915 D_ASSERT(list_empty(&mdev->resync_work.list));
2916 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 2917 D_ASSERT(list_empty(&mdev->go_diskless.list));
2918}
2919
2920
2921static void drbd_destroy_mempools(void)
2922{
2923 struct page *page;
2924
2925 while (drbd_pp_pool) {
2926 page = drbd_pp_pool;
2927 drbd_pp_pool = (struct page *)page_private(page);
2928 __free_page(page);
2929 drbd_pp_vacant--;
2930 }
2931
2932 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2933
2934 if (drbd_ee_mempool)
2935 mempool_destroy(drbd_ee_mempool);
2936 if (drbd_request_mempool)
2937 mempool_destroy(drbd_request_mempool);
2938 if (drbd_ee_cache)
2939 kmem_cache_destroy(drbd_ee_cache);
2940 if (drbd_request_cache)
2941 kmem_cache_destroy(drbd_request_cache);
2942 if (drbd_bm_ext_cache)
2943 kmem_cache_destroy(drbd_bm_ext_cache);
2944 if (drbd_al_ext_cache)
2945 kmem_cache_destroy(drbd_al_ext_cache);
2946
2947 drbd_ee_mempool = NULL;
2948 drbd_request_mempool = NULL;
2949 drbd_ee_cache = NULL;
2950 drbd_request_cache = NULL;
2951 drbd_bm_ext_cache = NULL;
2952 drbd_al_ext_cache = NULL;
2953
2954 return;
2955}
2956
2957static int drbd_create_mempools(void)
2958{
2959 struct page *page;
2960 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2961 int i;
2962
2963 /* prepare our caches and mempools */
2964 drbd_request_mempool = NULL;
2965 drbd_ee_cache = NULL;
2966 drbd_request_cache = NULL;
2967 drbd_bm_ext_cache = NULL;
2968 drbd_al_ext_cache = NULL;
2969 drbd_pp_pool = NULL;
2970
2971 /* caches */
2972 drbd_request_cache = kmem_cache_create(
2973 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2974 if (drbd_request_cache == NULL)
2975 goto Enomem;
2976
2977 drbd_ee_cache = kmem_cache_create(
2978 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2979 if (drbd_ee_cache == NULL)
2980 goto Enomem;
2981
2982 drbd_bm_ext_cache = kmem_cache_create(
2983 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2984 if (drbd_bm_ext_cache == NULL)
2985 goto Enomem;
2986
2987 drbd_al_ext_cache = kmem_cache_create(
2988 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2989 if (drbd_al_ext_cache == NULL)
2990 goto Enomem;
2991
2992 /* mempools */
2993 drbd_request_mempool = mempool_create(number,
2994 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2995 if (drbd_request_mempool == NULL)
2996 goto Enomem;
2997
2998 drbd_ee_mempool = mempool_create(number,
2999 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
 3000	if (drbd_ee_mempool == NULL)
3001 goto Enomem;
3002
3003 /* drbd's page pool */
3004 spin_lock_init(&drbd_pp_lock);
3005
3006 for (i = 0; i < number; i++) {
3007 page = alloc_page(GFP_HIGHUSER);
3008 if (!page)
3009 goto Enomem;
3010 set_page_private(page, (unsigned long)drbd_pp_pool);
3011 drbd_pp_pool = page;
3012 }
3013 drbd_pp_vacant = number;
3014
3015 return 0;
3016
3017Enomem:
3018 drbd_destroy_mempools(); /* in case we allocated some */
3019 return -ENOMEM;
3020}
3021
3022static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3023 void *unused)
3024{
3025 /* just so we have it. you never know what interesting things we
3026 * might want to do here some day...
3027 */
3028
3029 return NOTIFY_DONE;
3030}
3031
3032static struct notifier_block drbd_notifier = {
3033 .notifier_call = drbd_notify_sys,
3034};
3035
3036static void drbd_release_ee_lists(struct drbd_conf *mdev)
3037{
3038 int rr;
3039
3040 rr = drbd_release_ee(mdev, &mdev->active_ee);
3041 if (rr)
3042 dev_err(DEV, "%d EEs in active list found!\n", rr);
3043
3044 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3045 if (rr)
3046 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3047
3048 rr = drbd_release_ee(mdev, &mdev->read_ee);
3049 if (rr)
3050 dev_err(DEV, "%d EEs in read list found!\n", rr);
3051
3052 rr = drbd_release_ee(mdev, &mdev->done_ee);
3053 if (rr)
3054 dev_err(DEV, "%d EEs in done list found!\n", rr);
3055
3056 rr = drbd_release_ee(mdev, &mdev->net_ee);
3057 if (rr)
3058 dev_err(DEV, "%d EEs in net list found!\n", rr);
3059}
3060
3061/* caution. no locking.
3062 * currently only used from module cleanup code. */
3063static void drbd_delete_device(unsigned int minor)
3064{
3065 struct drbd_conf *mdev = minor_to_mdev(minor);
3066
3067 if (!mdev)
3068 return;
3069
3070 /* paranoia asserts */
3071 if (mdev->open_cnt != 0)
3072 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3073 __FILE__ , __LINE__);
3074
3075 ERR_IF (!list_empty(&mdev->data.work.q)) {
3076 struct list_head *lp;
3077 list_for_each(lp, &mdev->data.work.q) {
3078 dev_err(DEV, "lp = %p\n", lp);
3079 }
3080 };
3081 /* end paranoia asserts */
3082
3083 del_gendisk(mdev->vdisk);
3084
3085 /* cleanup stuff that may have been allocated during
3086 * device (re-)configuration or state changes */
3087
3088 if (mdev->this_bdev)
3089 bdput(mdev->this_bdev);
3090
3091 drbd_free_resources(mdev);
3092
3093 drbd_release_ee_lists(mdev);
3094
3095 /* should be free'd on disconnect? */
3096 kfree(mdev->ee_hash);
3097 /*
3098 mdev->ee_hash_s = 0;
3099 mdev->ee_hash = NULL;
3100 */
3101
3102 lc_destroy(mdev->act_log);
3103 lc_destroy(mdev->resync);
3104
3105 kfree(mdev->p_uuid);
3106 /* mdev->p_uuid = NULL; */
3107
3108 kfree(mdev->int_dig_out);
3109 kfree(mdev->int_dig_in);
3110 kfree(mdev->int_dig_vv);
3111
3112 /* cleanup the rest that has been
3113 * allocated from drbd_new_device
3114 * and actually free the mdev itself */
3115 drbd_free_mdev(mdev);
3116}
3117
3118static void drbd_cleanup(void)
3119{
3120 unsigned int i;
3121
3122 unregister_reboot_notifier(&drbd_notifier);
3123
3124 drbd_nl_cleanup();
3125
3126 if (minor_table) {
3127 if (drbd_proc)
3128 remove_proc_entry("drbd", NULL);
3129 i = minor_count;
3130 while (i--)
3131 drbd_delete_device(i);
3132 drbd_destroy_mempools();
3133 }
3134
3135 kfree(minor_table);
3136
3137 unregister_blkdev(DRBD_MAJOR, "drbd");
3138
3139 printk(KERN_INFO "drbd: module cleanup done.\n");
3140}
3141
3142/**
3143 * drbd_congested() - Callback for pdflush
3144 * @congested_data: User data
3145 * @bdi_bits: Bits pdflush is currently interested in
3146 *
3147 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3148 */
3149static int drbd_congested(void *congested_data, int bdi_bits)
3150{
3151 struct drbd_conf *mdev = congested_data;
3152 struct request_queue *q;
3153 char reason = '-';
3154 int r = 0;
3155
3156 if (!__inc_ap_bio_cond(mdev)) {
3157 /* DRBD has frozen IO */
3158 r = bdi_bits;
3159 reason = 'd';
3160 goto out;
3161 }
3162
3163 if (get_ldev(mdev)) {
3164 q = bdev_get_queue(mdev->ldev->backing_bdev);
3165 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3166 put_ldev(mdev);
3167 if (r)
3168 reason = 'b';
3169 }
3170
3171 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3172 r |= (1 << BDI_async_congested);
3173 reason = reason == 'b' ? 'a' : 'n';
3174 }
3175
3176out:
3177 mdev->congestion_reason = reason;
3178 return r;
3179}
3180
3181struct drbd_conf *drbd_new_device(unsigned int minor)
3182{
3183 struct drbd_conf *mdev;
3184 struct gendisk *disk;
3185 struct request_queue *q;
3186
3187 /* GFP_KERNEL, we are outside of all write-out paths */
3188 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3189 if (!mdev)
3190 return NULL;
3191 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3192 goto out_no_cpumask;
3193
3194 mdev->minor = minor;
3195
3196 drbd_init_set_defaults(mdev);
3197
3198 q = blk_alloc_queue(GFP_KERNEL);
3199 if (!q)
3200 goto out_no_q;
3201 mdev->rq_queue = q;
3202 q->queuedata = mdev;
3203
3204 disk = alloc_disk(1);
3205 if (!disk)
3206 goto out_no_disk;
3207 mdev->vdisk = disk;
3208
3209 set_disk_ro(disk, TRUE);
3210
3211 disk->queue = q;
3212 disk->major = DRBD_MAJOR;
3213 disk->first_minor = minor;
3214 disk->fops = &drbd_ops;
3215 sprintf(disk->disk_name, "drbd%d", minor);
3216 disk->private_data = mdev;
3217
3218 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3219 /* we have no partitions. we contain only ourselves. */
3220 mdev->this_bdev->bd_contains = mdev->this_bdev;
3221
3222 q->backing_dev_info.congested_fn = drbd_congested;
3223 q->backing_dev_info.congested_data = mdev;
3224
3225 blk_queue_make_request(q, drbd_make_request_26);
98ec286e 3226 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
3227 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3228 blk_queue_merge_bvec(q, drbd_merge_bvec);
3229 q->queue_lock = &mdev->req_lock; /* needed since we use */
3230 /* plugging on a queue, that actually has no requests! */
3231 q->unplug_fn = drbd_unplug_fn;
3232
3233 mdev->md_io_page = alloc_page(GFP_KERNEL);
3234 if (!mdev->md_io_page)
3235 goto out_no_io_page;
3236
3237 if (drbd_bm_init(mdev))
3238 goto out_no_bitmap;
3239 /* no need to lock access, we are still initializing this minor device. */
3240 if (!tl_init(mdev))
3241 goto out_no_tl;
3242
3243 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3244 if (!mdev->app_reads_hash)
3245 goto out_no_app_reads;
3246
3247 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3248 if (!mdev->current_epoch)
3249 goto out_no_epoch;
3250
3251 INIT_LIST_HEAD(&mdev->current_epoch->list);
3252 mdev->epochs = 1;
3253
3254 return mdev;
3255
3256/* out_whatever_else:
3257 kfree(mdev->current_epoch); */
3258out_no_epoch:
3259 kfree(mdev->app_reads_hash);
3260out_no_app_reads:
3261 tl_cleanup(mdev);
3262out_no_tl:
3263 drbd_bm_cleanup(mdev);
3264out_no_bitmap:
3265 __free_page(mdev->md_io_page);
3266out_no_io_page:
3267 put_disk(disk);
3268out_no_disk:
3269 blk_cleanup_queue(q);
3270out_no_q:
3271 free_cpumask_var(mdev->cpu_mask);
3272out_no_cpumask:
3273 kfree(mdev);
3274 return NULL;
3275}
3276
3277/* counterpart of drbd_new_device.
3278 * last part of drbd_delete_device. */
3279void drbd_free_mdev(struct drbd_conf *mdev)
3280{
3281 kfree(mdev->current_epoch);
3282 kfree(mdev->app_reads_hash);
3283 tl_cleanup(mdev);
3284 if (mdev->bitmap) /* should no longer be there. */
3285 drbd_bm_cleanup(mdev);
3286 __free_page(mdev->md_io_page);
3287 put_disk(mdev->vdisk);
3288 blk_cleanup_queue(mdev->rq_queue);
3289 free_cpumask_var(mdev->cpu_mask);
3290 kfree(mdev);
3291}
3292
3293
3294int __init drbd_init(void)
3295{
3296 int err;
3297
3298 if (sizeof(struct p_handshake) != 80) {
3299 printk(KERN_ERR
3300 "drbd: never change the size or layout "
3301 "of the HandShake packet.\n");
3302 return -EINVAL;
3303 }
3304
3305 if (1 > minor_count || minor_count > 255) {
3306 printk(KERN_ERR
3307 "drbd: invalid minor_count (%d)\n", minor_count);
3308#ifdef MODULE
3309 return -EINVAL;
3310#else
3311 minor_count = 8;
3312#endif
3313 }
3314
3315 err = drbd_nl_init();
3316 if (err)
3317 return err;
3318
3319 err = register_blkdev(DRBD_MAJOR, "drbd");
3320 if (err) {
3321 printk(KERN_ERR
3322 "drbd: unable to register block device major %d\n",
3323 DRBD_MAJOR);
3324 return err;
3325 }
3326
3327 register_reboot_notifier(&drbd_notifier);
3328
3329 /*
3330 * allocate all necessary structs
3331 */
3332 err = -ENOMEM;
3333
3334 init_waitqueue_head(&drbd_pp_wait);
3335
3336 drbd_proc = NULL; /* play safe for drbd_cleanup */
3337 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3338 GFP_KERNEL);
3339 if (!minor_table)
3340 goto Enomem;
3341
3342 err = drbd_create_mempools();
3343 if (err)
3344 goto Enomem;
3345
8c484ee4 3346 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3347 if (!drbd_proc) {
3348 printk(KERN_ERR "drbd: unable to register proc file\n");
3349 goto Enomem;
3350 }
3351
3352 rwlock_init(&global_state_lock);
3353
3354 printk(KERN_INFO "drbd: initialized. "
3355 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3356 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3357 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3358 printk(KERN_INFO "drbd: registered as block device major %d\n",
3359 DRBD_MAJOR);
3360 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3361
3362 return 0; /* Success! */
3363
3364Enomem:
3365 drbd_cleanup();
3366 if (err == -ENOMEM)
3367 /* currently always the case */
3368 printk(KERN_ERR "drbd: ran out of memory\n");
3369 else
3370 printk(KERN_ERR "drbd: initialization failure\n");
3371 return err;
3372}
3373
3374void drbd_free_bc(struct drbd_backing_dev *ldev)
3375{
3376 if (ldev == NULL)
3377 return;
3378
3379 bd_release(ldev->backing_bdev);
3380 bd_release(ldev->md_bdev);
3381
3382 fput(ldev->lo_file);
3383 fput(ldev->md_file);
3384
3385 kfree(ldev);
3386}
3387
3388void drbd_free_sock(struct drbd_conf *mdev)
3389{
3390 if (mdev->data.socket) {
4589d7f8 3391 mutex_lock(&mdev->data.mutex);
3392 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3393 sock_release(mdev->data.socket);
3394 mdev->data.socket = NULL;
4589d7f8 3395 mutex_unlock(&mdev->data.mutex);
3396 }
3397 if (mdev->meta.socket) {
4589d7f8 3398 mutex_lock(&mdev->meta.mutex);
3399 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3400 sock_release(mdev->meta.socket);
3401 mdev->meta.socket = NULL;
4589d7f8 3402 mutex_unlock(&mdev->meta.mutex);
3403 }
3404}
3405
3406
3407void drbd_free_resources(struct drbd_conf *mdev)
3408{
3409 crypto_free_hash(mdev->csums_tfm);
3410 mdev->csums_tfm = NULL;
3411 crypto_free_hash(mdev->verify_tfm);
3412 mdev->verify_tfm = NULL;
3413 crypto_free_hash(mdev->cram_hmac_tfm);
3414 mdev->cram_hmac_tfm = NULL;
3415 crypto_free_hash(mdev->integrity_w_tfm);
3416 mdev->integrity_w_tfm = NULL;
3417 crypto_free_hash(mdev->integrity_r_tfm);
3418 mdev->integrity_r_tfm = NULL;
3419
3420 drbd_free_sock(mdev);
3421
3422 __no_warn(local,
3423 drbd_free_bc(mdev->ldev);
3424 mdev->ldev = NULL;);
3425}
3426
3427/* meta data management */
3428
3429struct meta_data_on_disk {
3430 u64 la_size; /* last agreed size. */
3431 u64 uuid[UI_SIZE]; /* UUIDs. */
3432 u64 device_uuid;
3433 u64 reserved_u64_1;
3434 u32 flags; /* MDF */
3435 u32 magic;
3436 u32 md_size_sect;
3437 u32 al_offset; /* offset to this block */
3438 u32 al_nr_extents; /* important for restoring the AL */
3439 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3440 u32 bm_offset; /* offset to the bitmap, from here */
3441 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3442 u32 reserved_u32[4];
3443
3444} __packed;
3445
3446/**
3447 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3448 * @mdev: DRBD device.
3449 */
3450void drbd_md_sync(struct drbd_conf *mdev)
3451{
3452 struct meta_data_on_disk *buffer;
3453 sector_t sector;
3454 int i;
3455
3456 del_timer(&mdev->md_sync_timer);
3457 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3458 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3459 return;
3460
3461 /* We use here D_FAILED and not D_ATTACHING because we try to write
3462 * metadata even if we detach due to a disk failure! */
3463 if (!get_ldev_if_state(mdev, D_FAILED))
3464 return;
3465
3466 mutex_lock(&mdev->md_io_mutex);
3467 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3468 memset(buffer, 0, 512);
3469
3470 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3471 for (i = UI_CURRENT; i < UI_SIZE; i++)
3472 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3473 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3474 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3475
3476 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3477 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3478 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3479 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3480 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3481
3482 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3483
3484 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3485 sector = mdev->ldev->md.md_offset;
3486
3f3a9b84 3487 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3488 /* this was a try anyways ... */
3489 dev_err(DEV, "meta data update failed!\n");
3490 drbd_chk_io_error(mdev, 1, TRUE);
3491 }
3492
3493 /* Update mdev->ldev->md.la_size_sect,
3494 * since we updated it on metadata. */
3495 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3496
3497 mutex_unlock(&mdev->md_io_mutex);
3498 put_ldev(mdev);
3499}
3500
3501/**
3502 * drbd_md_read() - Reads in the meta data super block
3503 * @mdev: DRBD device.
3504 * @bdev: Device from which the meta data should be read in.
3505 *
3506 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3507 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3508 */
3509int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3510{
3511 struct meta_data_on_disk *buffer;
3512 int i, rv = NO_ERROR;
3513
3514 if (!get_ldev_if_state(mdev, D_ATTACHING))
3515 return ERR_IO_MD_DISK;
3516
3517 mutex_lock(&mdev->md_io_mutex);
3518 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3519
3520 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
 3521		/* NOTE: can't do normal error processing here as this is
3522 called BEFORE disk is attached */
3523 dev_err(DEV, "Error while reading metadata.\n");
3524 rv = ERR_IO_MD_DISK;
3525 goto err;
3526 }
3527
3528 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3529 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3530 rv = ERR_MD_INVALID;
3531 goto err;
3532 }
3533 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3534 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3535 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3536 rv = ERR_MD_INVALID;
3537 goto err;
3538 }
3539 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3540 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3541 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3542 rv = ERR_MD_INVALID;
3543 goto err;
3544 }
3545 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3546 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3547 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3548 rv = ERR_MD_INVALID;
3549 goto err;
3550 }
3551
3552 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3553 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3554 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3555 rv = ERR_MD_INVALID;
3556 goto err;
3557 }
3558
3559 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3560 for (i = UI_CURRENT; i < UI_SIZE; i++)
3561 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3562 bdev->md.flags = be32_to_cpu(buffer->flags);
3563 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3564 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3565
3566 if (mdev->sync_conf.al_extents < 7)
3567 mdev->sync_conf.al_extents = 127;
3568
3569 err:
3570 mutex_unlock(&mdev->md_io_mutex);
3571 put_ldev(mdev);
3572
3573 return rv;
3574}
3575
3576static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3577{
3578 static char *uuid_str[UI_EXTENDED_SIZE] = {
3579 [UI_CURRENT] = "CURRENT",
3580 [UI_BITMAP] = "BITMAP",
3581 [UI_HISTORY_START] = "HISTORY_START",
3582 [UI_HISTORY_END] = "HISTORY_END",
3583 [UI_SIZE] = "SIZE",
3584 [UI_FLAGS] = "FLAGS",
3585 };
3586
3587 if (index >= UI_EXTENDED_SIZE) {
3588 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3589 return;
3590 }
3591
3592 dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3593 uuid_str[index],
3594 (unsigned long long)mdev->ldev->md.uuid[index]);
3595}
3596
3597
3598/**
3599 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3600 * @mdev: DRBD device.
3601 *
3602 * Call this function if you change anything that should be written to
3603 * the meta-data super block. This function sets MD_DIRTY, and arms a
3604 * timer that ensures drbd_md_sync() is called within five seconds.
3605 */
ca0e6098 3606#ifdef DEBUG
3607void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3608{
3609 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3610 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3611 mdev->last_md_mark_dirty.line = line;
3612 mdev->last_md_mark_dirty.func = func;
3613 }
3614}
3615#else
3616void drbd_md_mark_dirty(struct drbd_conf *mdev)
3617{
ee15b038 3618 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 3619 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 3620}
ee15b038 3621#endif
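/*
 * Usage sketch (assumed, for illustration): callers modify the in-core
 * meta data and then mark it dirty; persistence happens either lazily via
 * md_sync_timer_fn()/w_md_sync(), or explicitly when the change must hit
 * stable storage right away.  MDF_SOME_FLAG is a hypothetical flag name.
 *
 *	mdev->ldev->md.flags |= MDF_SOME_FLAG;
 *	drbd_md_mark_dirty(mdev);	// the timer guarantees a sync soon
 *	drbd_md_sync(mdev);		// optional: write it out immediately
 */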
3622
3623static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3624{
3625 int i;
3626
ac724121 3627 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
b411b363 3628 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3629 debug_drbd_uuid(mdev, i+1);
3630 }
3631}
3632
3633void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3634{
3635 if (idx == UI_CURRENT) {
3636 if (mdev->state.role == R_PRIMARY)
3637 val |= 1;
3638 else
3639 val &= ~((u64)1);
3640
3641 drbd_set_ed_uuid(mdev, val);
3642 }
3643
3644 mdev->ldev->md.uuid[idx] = val;
ac724121 3645 debug_drbd_uuid(mdev, idx);
3646 drbd_md_mark_dirty(mdev);
3647}
3648
3649
3650void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3651{
3652 if (mdev->ldev->md.uuid[idx]) {
3653 drbd_uuid_move_history(mdev);
3654 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
ac724121 3655 debug_drbd_uuid(mdev, UI_HISTORY_START);
3656 }
3657 _drbd_uuid_set(mdev, idx, val);
3658}
3659
3660/**
3661 * drbd_uuid_new_current() - Creates a new current UUID
3662 * @mdev: DRBD device.
3663 *
3664 * Creates a new current UUID, and rotates the old current UUID into
3665 * the bitmap slot. Causes an incremental resync upon next connect.
3666 */
3667void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3668{
3669 u64 val;
3670
3671 dev_info(DEV, "Creating new current UUID\n");
3672 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3673 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
ac724121 3674 debug_drbd_uuid(mdev, UI_BITMAP);
3675
3676 get_random_bytes(&val, sizeof(u64));
3677 _drbd_uuid_set(mdev, UI_CURRENT, val);
3678 /* get it to stable storage _now_ */
3679 drbd_md_sync(mdev);
3680}
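/*
 * Sketch of the rotation performed above (UUID values are made up):
 *
 *	before:  CURRENT = A   BITMAP = 0   HISTORY_START = H1
 *	after:   CURRENT = N   BITMAP = A   HISTORY_START = H1
 *
 * where N comes from get_random_bytes().  On the next connect, a peer
 * still presenting A as its current UUID matches our bitmap UUID, which
 * is what allows the incremental (bitmap based) resync mentioned in the
 * comment above.
 */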
3681
3682void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3683{
3684 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3685 return;
3686
3687 if (val == 0) {
3688 drbd_uuid_move_history(mdev);
3689 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3690 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3691 debug_drbd_uuid(mdev, UI_HISTORY_START);
3692 debug_drbd_uuid(mdev, UI_BITMAP);
3693 } else {
3694 if (mdev->ldev->md.uuid[UI_BITMAP])
3695 dev_warn(DEV, "bm UUID already set\n");
3696
3697 mdev->ldev->md.uuid[UI_BITMAP] = val;
3698 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3699
ac724121 3700 debug_drbd_uuid(mdev, UI_BITMAP);
3701 }
3702 drbd_md_mark_dirty(mdev);
3703}
3704
3705/**
3706 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3707 * @mdev: DRBD device.
3708 *
3709 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3710 */
3711int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3712{
3713 int rv = -EIO;
3714
3715 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3716 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3717 drbd_md_sync(mdev);
3718 drbd_bm_set_all(mdev);
3719
3720 rv = drbd_bm_write(mdev);
3721
3722 if (!rv) {
3723 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3724 drbd_md_sync(mdev);
3725 }
3726
3727 put_ldev(mdev);
3728 }
3729
3730 return rv;
3731}
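/*
 * Note on the ordering above: MDF_FULL_SYNC is made persistent before the
 * bitmap is written, and cleared again only after drbd_bm_write()
 * succeeded.  A crash in between therefore still forces a full sync on
 * the next attach instead of trusting a half-written bitmap.
 */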
3732
3733/**
3734 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3735 * @mdev: DRBD device.
3736 *
3737 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3738 */
3739int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3740{
3741 int rv = -EIO;
3742
0778286a 3743 drbd_resume_al(mdev);
3744 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3745 drbd_bm_clear_all(mdev);
3746 rv = drbd_bm_write(mdev);
3747 put_ldev(mdev);
3748 }
3749
3750 return rv;
3751}
3752
3753static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3754{
3755 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3756 int rv;
3757
3758 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3759
3760 drbd_bm_lock(mdev, work->why);
3761 rv = work->io_fn(mdev);
3762 drbd_bm_unlock(mdev);
3763
3764 clear_bit(BITMAP_IO, &mdev->flags);
3765 wake_up(&mdev->misc_wait);
3766
3767 if (work->done)
3768 work->done(mdev, rv);
3769
3770 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3771 work->why = NULL;
3772
3773 return 1;
3774}
3775
3776void drbd_ldev_destroy(struct drbd_conf *mdev)
3777{
3778 lc_destroy(mdev->resync);
3779 mdev->resync = NULL;
3780 lc_destroy(mdev->act_log);
3781 mdev->act_log = NULL;
3782 __no_warn(local,
3783 drbd_free_bc(mdev->ldev);
3784 mdev->ldev = NULL;);
3785
3786 if (mdev->md_io_tmpp) {
3787 __free_page(mdev->md_io_tmpp);
3788 mdev->md_io_tmpp = NULL;
3789 }
3790 clear_bit(GO_DISKLESS, &mdev->flags);
3791}
3792
3793static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3794{
3795 D_ASSERT(mdev->state.disk == D_FAILED);
3796 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3797 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3798 * the protected members anymore, though, so once local_cnt reaches zero
3799 * again (on the final put_ldev), it will be safe to free them. */
e9e6f3ec 3800 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3801 return 1;
3802}
3803
3804void drbd_go_diskless(struct drbd_conf *mdev)
3805{
3806 D_ASSERT(mdev->state.disk == D_FAILED);
3807 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
9d282875 3808 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3809}
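/*
 * Rough sequence, as far as it can be read from this file: a fatal disk
 * error moves the disk to D_FAILED, drbd_go_diskless() queues go_diskless
 * exactly once, w_go_diskless() then forces D_DISKLESS, and only after
 * the last local reference is dropped is it safe for drbd_ldev_destroy()
 * to free the activity log, resync LRU and backing device structures.
 */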
3810
3811/**
3812 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3813 * @mdev: DRBD device.
3814 * @io_fn: IO callback to be called when bitmap IO is possible
3815 * @done: callback to be called after the bitmap IO was performed
3816 * @why: Descriptive text of the reason for doing the IO
3817 *
3818 * While IO on the bitmap happens we freeze application IO, thus ensuring
3819 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3820 * called from worker context. It MUST NOT be used while a previous such
3821 * work is still pending!
3822 */
3823void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3824 int (*io_fn)(struct drbd_conf *),
3825 void (*done)(struct drbd_conf *, int),
3826 char *why)
3827{
3828 D_ASSERT(current == mdev->worker.task);
3829
3830 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3831 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3832 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3833 if (mdev->bm_io_work.why)
3834 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3835 why, mdev->bm_io_work.why);
3836
3837 mdev->bm_io_work.io_fn = io_fn;
3838 mdev->bm_io_work.done = done;
3839 mdev->bm_io_work.why = why;
3840
3841 set_bit(BITMAP_IO, &mdev->flags);
3842 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3843 if (list_empty(&mdev->bm_io_work.w.list)) {
3844 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3845 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3846 } else
3847 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3848 }
3849}
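/*
 * Call sketch (illustrative; the done callback name is an assumption),
 * issued from worker context to queue a full "set all bits and write out"
 * pass:
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     &my_done_fn, "set_n_write sketch");
 *
 * New application IO is held back while BITMAP_IO is set; w_bitmap_io()
 * runs io_fn under the bitmap lock and then invokes the done callback.
 */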
3850
3851/**
3852 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3853 * @mdev: DRBD device.
3854 * @io_fn: IO callback to be called when bitmap IO is possible
3855 * @why: Descriptive text of the reason for doing the IO
3856 *
3857 * Freezes application IO while the actual bitmap IO operation runs. This
3858 * function MAY NOT be called from worker context.
3859 */
3860int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3861{
3862 int rv;
3863
3864 D_ASSERT(current != mdev->worker.task);
3865
3866 drbd_suspend_io(mdev);
3867
3868 drbd_bm_lock(mdev, why);
3869 rv = io_fn(mdev);
3870 drbd_bm_unlock(mdev);
3871
3872 drbd_resume_io(mdev);
3873
3874 return rv;
3875}
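/*
 * Synchronous counterpart sketch (illustrative): from non-worker context
 * the same io_fn callbacks can be run directly and the result mapped to
 * an error code; the ERR_IO_MD_DISK mapping is an assumption:
 *
 *	if (drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear sketch"))
 *		retcode = ERR_IO_MD_DISK;
 */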
3876
3877void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3878{
3879 if ((mdev->ldev->md.flags & flag) != flag) {
3880 drbd_md_mark_dirty(mdev);
3881 mdev->ldev->md.flags |= flag;
3882 }
3883}
3884
3885void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3886{
3887 if ((mdev->ldev->md.flags & flag) != 0) {
3888 drbd_md_mark_dirty(mdev);
3889 mdev->ldev->md.flags &= ~flag;
3890 }
3891}
3892int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3893{
3894 return (bdev->md.flags & flag) != 0;
3895}
3896
3897static void md_sync_timer_fn(unsigned long data)
3898{
3899 struct drbd_conf *mdev = (struct drbd_conf *) data;
3900
3901 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3902}
3903
3904static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3905{
3906 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3907#ifdef DEBUG
3908 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3909 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3910#endif
b411b363 3911 drbd_md_sync(mdev);
3912 return 1;
3913}
3914
3915#ifdef CONFIG_DRBD_FAULT_INJECTION
3916/* Fault insertion support including random number generator shamelessly
3917 * stolen from kernel/rcutorture.c */
3918struct fault_random_state {
3919 unsigned long state;
3920 unsigned long count;
3921};
3922
3923#define FAULT_RANDOM_MULT 39916801 /* prime */
3924#define FAULT_RANDOM_ADD 479001701 /* prime */
3925#define FAULT_RANDOM_REFRESH 10000
3926
3927/*
3928 * Crude but fast random-number generator. Uses a linear congruential
3929 * generator, with occasional help from get_random_bytes().
3930 */
3931static unsigned long
3932_drbd_fault_random(struct fault_random_state *rsp)
3933{
3934 long refresh;
3935
49829ea7 3936 if (!rsp->count--) {
3937 get_random_bytes(&refresh, sizeof(refresh));
3938 rsp->state += refresh;
3939 rsp->count = FAULT_RANDOM_REFRESH;
3940 }
3941 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3942 return swahw32(rsp->state);
3943}
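/*
 * In other words: state is advanced as state * 39916801 + 479001701
 * (mod 2^BITS_PER_LONG) and reseeded from get_random_bytes() every
 * FAULT_RANDOM_REFRESH draws.  Returning swahw32(state) swaps the 16-bit
 * halfwords, presumably so that the stronger high-order bits of the LCG
 * end up in the low bits that the "% 100" below actually looks at.
 */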
3944
3945static char *
3946_drbd_fault_str(unsigned int type) {
3947 static char *_faults[] = {
3948 [DRBD_FAULT_MD_WR] = "Meta-data write",
3949 [DRBD_FAULT_MD_RD] = "Meta-data read",
3950 [DRBD_FAULT_RS_WR] = "Resync write",
3951 [DRBD_FAULT_RS_RD] = "Resync read",
3952 [DRBD_FAULT_DT_WR] = "Data write",
3953 [DRBD_FAULT_DT_RD] = "Data read",
3954 [DRBD_FAULT_DT_RA] = "Data read ahead",
3955 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3956 [DRBD_FAULT_AL_EE] = "EE allocation",
3957 [DRBD_FAULT_RECEIVE] = "receive data corruption",
3958 };
3959
3960 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3961}
3962
3963unsigned int
3964_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3965{
3966 static struct fault_random_state rrs = {0, 0};
3967
3968 unsigned int ret = (
3969 (fault_devs == 0 ||
3970 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3971 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3972
3973 if (ret) {
3974 fault_count++;
3975
7383506c 3976 if (__ratelimit(&drbd_ratelimit_state))
3977 dev_warn(DEV, "***Simulating %s failure\n",
3978 _drbd_fault_str(type));
3979 }
3980
3981 return ret;
3982}
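/*
 * Typical use sketch (assumed, not quoted from a call site): IO submission
 * paths ask whether to fake a failure of a given type before issuing the
 * request, roughly:
 *
 *	if (drbd_insert_fault(mdev, DRBD_FAULT_MD_WR))
 *		bio_endio(bio, -EIO);		// pretend the write failed
 *	else
 *		submit_bio(rw, bio);
 *
 * where drbd_insert_fault() is assumed to be a thin wrapper around
 * _drbd_insert_fault() that compiles away when fault injection is off.
 */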
3983#endif
3984
3985const char *drbd_buildtag(void)
3986{
3987 /* DRBD built from external sources carries a reference to the
3988 git hash of the source code here. */
3989
3990 static char buildtag[38] = "\0uilt-in";
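	/* The leading NUL doubles as an "initialized" flag: module builds
	 * overwrite the buffer with the srcversion below, while built-in
	 * builds (THIS_MODULE == NULL) just flip the first byte to 'b',
	 * yielding "built-in". */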
3991
3992 if (buildtag[0] == 0) {
3993#ifdef CONFIG_MODULES
3994 if (THIS_MODULE != NULL)
3995 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3996 else
3997#endif
3998 buildtag[0] = 'b';
3999 }
4000
4001 return buildtag;
4002}
4003
4004module_init(drbd_init)
4005module_exit(drbd_cleanup)
4006
4007EXPORT_SYMBOL(drbd_conn_str);
4008EXPORT_SYMBOL(drbd_role_str);
4009EXPORT_SYMBOL(drbd_disk_str);
4010EXPORT_SYMBOL(drbd_set_st_err_str);