/* drivers/block/drbd/drbd_main.c */
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
b411b363 29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/smp_lock.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
67int drbdd_init(struct drbd_thread *);
68int drbd_worker(struct drbd_thread *);
69int drbd_asender(struct drbd_thread *);
70
71int drbd_init(void);
72static int drbd_open(struct block_device *bdev, fmode_t mode);
73static int drbd_release(struct gendisk *gd, fmode_t mode);
74static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
75static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
76 union drbd_state ns, enum chg_state_flags flags);
77static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78static void md_sync_timer_fn(unsigned long data);
79static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80
81MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
82 "Lars Ellenberg <lars@linbit.com>");
83MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
84MODULE_VERSION(REL_VERSION);
85MODULE_LICENSE("GPL");
86MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
87MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
88
89#include <linux/moduleparam.h>
90/* allow_open_on_secondary */
91MODULE_PARM_DESC(allow_oos, "DONT USE!");
 92/* thanks to these macros, if compiled into the kernel (not as a module),
93 * this becomes the boot parameter drbd.minor_count */
94module_param(minor_count, uint, 0444);
95module_param(disable_sendpage, bool, 0644);
96module_param(allow_oos, bool, 0);
97module_param(cn_idx, uint, 0444);
98module_param(proc_details, int, 0644);
99
100#ifdef CONFIG_DRBD_FAULT_INJECTION
101int enable_faults;
102int fault_rate;
103static int fault_count;
104int fault_devs;
105/* bitmap of enabled faults */
106module_param(enable_faults, int, 0664);
107/* fault rate % value - applies to all enabled faults */
108module_param(fault_rate, int, 0664);
109/* count of faults inserted */
110module_param(fault_count, int, 0664);
111/* bitmap of devices to insert faults on */
112module_param(fault_devs, int, 0644);
113#endif
114
115/* module parameter, defined */
116unsigned int minor_count = 32;
117int disable_sendpage;
118int allow_oos;
119unsigned int cn_idx = CN_IDX_DRBD;
 120int proc_details; /* Detail level in /proc/drbd */
121
122/* Module parameter for setting the user mode helper program
123 * to run. Default is /sbin/drbdadm */
124char usermode_helper[80] = "/sbin/drbdadm";
125
126module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
127
128/* in 2.6.x, our device mapping and config info contains our virtual gendisks
129 * as member "struct gendisk *vdisk;"
130 */
131struct drbd_conf **minor_table;
132
133struct kmem_cache *drbd_request_cache;
134struct kmem_cache *drbd_ee_cache; /* epoch entries */
135struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
136struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
137mempool_t *drbd_request_mempool;
138mempool_t *drbd_ee_mempool;
139
140/* I do not use a standard mempool, because:
141 1) I want to hand out the pre-allocated objects first.
142 2) I want to be able to interrupt sleeping allocation with a signal.
 143 Note: This is a singly linked list, the next pointer is the private
144 member of struct page.
145 */
146struct page *drbd_pp_pool;
147spinlock_t drbd_pp_lock;
148int drbd_pp_vacant;
149wait_queue_head_t drbd_pp_wait;
150
151DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
152
 153static const struct block_device_operations drbd_ops = {
154 .owner = THIS_MODULE,
155 .open = drbd_open,
156 .release = drbd_release,
157};
158
159#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
160
161#ifdef __CHECKER__
162/* When checking with sparse, and this is an inline function, sparse will
 163 give tons of false positives. When this is a real function, sparse works.
164 */
165int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
166{
167 int io_allowed;
168
169 atomic_inc(&mdev->local_cnt);
170 io_allowed = (mdev->state.disk >= mins);
171 if (!io_allowed) {
172 if (atomic_dec_and_test(&mdev->local_cnt))
173 wake_up(&mdev->misc_wait);
174 }
175 return io_allowed;
176}
177
178#endif
179
180/**
181 * DOC: The transfer log
182 *
 183 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
184 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
185 * of the list. There is always at least one &struct drbd_tl_epoch object.
186 *
 187 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
188 * attached.
189 */
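/*
 * Editor's sketch (illustrative, not from the original source): the
 * transfer log conceptually looks like
 *
 *	oldest_tle -> epoch -> epoch -> ... -> newest_tle -> NULL
 *
 * with each epoch carrying the circular list of requests that were sent
 * while it was the newest one. tl_release() below retires the oldest
 * epoch once the matching barrier ack arrives; tl_clear() throws the
 * whole log away when the connection is lost.
 */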
190static int tl_init(struct drbd_conf *mdev)
191{
192 struct drbd_tl_epoch *b;
193
194 /* during device minor initialization, we may well use GFP_KERNEL */
195 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
196 if (!b)
197 return 0;
198 INIT_LIST_HEAD(&b->requests);
199 INIT_LIST_HEAD(&b->w.list);
200 b->next = NULL;
201 b->br_number = 4711;
202 b->n_req = 0;
203 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204
205 mdev->oldest_tle = b;
206 mdev->newest_tle = b;
207 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
208
209 mdev->tl_hash = NULL;
210 mdev->tl_hash_s = 0;
211
212 return 1;
213}
214
215static void tl_cleanup(struct drbd_conf *mdev)
216{
217 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
218 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
219 kfree(mdev->oldest_tle);
220 mdev->oldest_tle = NULL;
221 kfree(mdev->unused_spare_tle);
222 mdev->unused_spare_tle = NULL;
223 kfree(mdev->tl_hash);
224 mdev->tl_hash = NULL;
225 mdev->tl_hash_s = 0;
226}
227
228/**
229 * _tl_add_barrier() - Adds a barrier to the transfer log
230 * @mdev: DRBD device.
231 * @new: Barrier to be added before the current head of the TL.
232 *
233 * The caller must hold the req_lock.
234 */
235void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
236{
237 struct drbd_tl_epoch *newest_before;
238
239 INIT_LIST_HEAD(&new->requests);
240 INIT_LIST_HEAD(&new->w.list);
241 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242 new->next = NULL;
243 new->n_req = 0;
244
245 newest_before = mdev->newest_tle;
246 /* never send a barrier number == 0, because that is special-cased
247 * when using TCQ for our write ordering code */
248 new->br_number = (newest_before->br_number+1) ?: 1;
249 if (mdev->newest_tle != new) {
250 mdev->newest_tle->next = new;
251 mdev->newest_tle = new;
252 }
253}
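/*
 * Editor's note (illustrative): "(newest_before->br_number+1) ?: 1" above
 * uses the GNU "a ?: b" shorthand for "a ? a : b". For example, when
 * br_number is 0xffffffff the increment wraps to 0 and the new epoch is
 * numbered 1 instead, preserving the "barrier number 0 is special" rule.
 */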
254
255/**
256 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
257 * @mdev: DRBD device.
258 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
259 * @set_size: Expected number of requests before that barrier.
260 *
261 * In case the passed barrier_nr or set_size does not match the oldest
262 * &struct drbd_tl_epoch objects this function will cause a termination
263 * of the connection.
264 */
265void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
266 unsigned int set_size)
267{
268 struct drbd_tl_epoch *b, *nob; /* next old barrier */
269 struct list_head *le, *tle;
270 struct drbd_request *r;
271
272 spin_lock_irq(&mdev->req_lock);
273
274 b = mdev->oldest_tle;
275
276 /* first some paranoia code */
277 if (b == NULL) {
278 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
279 barrier_nr);
280 goto bail;
281 }
282 if (b->br_number != barrier_nr) {
283 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
284 barrier_nr, b->br_number);
285 goto bail;
286 }
287 if (b->n_req != set_size) {
288 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
289 barrier_nr, set_size, b->n_req);
290 goto bail;
291 }
292
293 /* Clean up list of requests processed during current epoch */
294 list_for_each_safe(le, tle, &b->requests) {
295 r = list_entry(le, struct drbd_request, tl_requests);
296 _req_mod(r, barrier_acked);
297 }
298 /* There could be requests on the list waiting for completion
 299 of the write to the local disk. To avoid corruption of
 300 slab's data structures we have to remove the list's head.
 301
 302 Also there could have been a barrier ack out of sequence, overtaking
 303 the write acks - which would be a bug and would violate write ordering.
 304 To not deadlock in case we lose connection while such requests are
 305 still pending, we need some way to find them for the
 306 _req_mod(connection_lost_while_pending).
307
308 These have been list_move'd to the out_of_sequence_requests list in
309 _req_mod(, barrier_acked) above.
310 */
311 list_del_init(&b->requests);
312
313 nob = b->next;
314 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
315 _tl_add_barrier(mdev, b);
316 if (nob)
317 mdev->oldest_tle = nob;
318 /* if nob == NULL b was the only barrier, and becomes the new
319 barrier. Therefore mdev->oldest_tle points already to b */
320 } else {
321 D_ASSERT(nob != NULL);
322 mdev->oldest_tle = nob;
323 kfree(b);
324 }
325
326 spin_unlock_irq(&mdev->req_lock);
327 dec_ap_pending(mdev);
328
329 return;
330
331bail:
332 spin_unlock_irq(&mdev->req_lock);
333 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
334}
335
336
337/**
338 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
339 * @mdev: DRBD device.
340 *
341 * This is called after the connection to the peer was lost. The storage covered
 342 * by the requests on the transfer log gets marked as out of sync. Called from the
343 * receiver thread and the worker thread.
344 */
345void tl_clear(struct drbd_conf *mdev)
346{
347 struct drbd_tl_epoch *b, *tmp;
348 struct list_head *le, *tle;
349 struct drbd_request *r;
350 int new_initial_bnr = net_random();
351
352 spin_lock_irq(&mdev->req_lock);
353
354 b = mdev->oldest_tle;
355 while (b) {
356 list_for_each_safe(le, tle, &b->requests) {
357 r = list_entry(le, struct drbd_request, tl_requests);
358 /* It would be nice to complete outside of spinlock.
359 * But this is easier for now. */
360 _req_mod(r, connection_lost_while_pending);
361 }
362 tmp = b->next;
363
364 /* there could still be requests on that ring list,
365 * in case local io is still pending */
366 list_del(&b->requests);
367
368 /* dec_ap_pending corresponding to queue_barrier.
369 * the newest barrier may not have been queued yet,
370 * in which case w.cb is still NULL. */
371 if (b->w.cb != NULL)
372 dec_ap_pending(mdev);
373
374 if (b == mdev->newest_tle) {
375 /* recycle, but reinit! */
376 D_ASSERT(tmp == NULL);
377 INIT_LIST_HEAD(&b->requests);
378 INIT_LIST_HEAD(&b->w.list);
379 b->w.cb = NULL;
380 b->br_number = new_initial_bnr;
381 b->n_req = 0;
382
383 mdev->oldest_tle = b;
384 break;
385 }
386 kfree(b);
387 b = tmp;
388 }
389
390 /* we expect this list to be empty. */
391 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
392
393 /* but just in case, clean it up anyways! */
394 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
395 r = list_entry(le, struct drbd_request, tl_requests);
396 /* It would be nice to complete outside of spinlock.
397 * But this is easier for now. */
398 _req_mod(r, connection_lost_while_pending);
399 }
400
401 /* ensure bit indicating barrier is required is clear */
402 clear_bit(CREATE_BARRIER, &mdev->flags);
403
404 spin_unlock_irq(&mdev->req_lock);
405}
406
407/**
408 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
409 * @mdev: DRBD device.
410 * @os: old (current) state.
411 * @ns: new (wanted) state.
412 */
413static int cl_wide_st_chg(struct drbd_conf *mdev,
414 union drbd_state os, union drbd_state ns)
415{
416 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
417 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
418 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
419 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
420 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
421 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
422 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
423}
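/*
 * For illustration (editor's note): promoting to Primary, starting a
 * resync, or going Diskless while connected all count as cluster-wide
 * and have to be negotiated with the peer first, whereas e.g. a role
 * change on a StandAlone device is handled purely locally.
 */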
424
425int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
426 union drbd_state mask, union drbd_state val)
427{
428 unsigned long flags;
429 union drbd_state os, ns;
430 int rv;
431
432 spin_lock_irqsave(&mdev->req_lock, flags);
433 os = mdev->state;
434 ns.i = (os.i & ~mask.i) | val.i;
435 rv = _drbd_set_state(mdev, ns, f, NULL);
436 ns = mdev->state;
437 spin_unlock_irqrestore(&mdev->req_lock, flags);
438
439 return rv;
440}
441
442/**
443 * drbd_force_state() - Impose a change which happens outside our control on our state
444 * @mdev: DRBD device.
445 * @mask: mask of state bits to change.
446 * @val: value of new state bits.
447 */
448void drbd_force_state(struct drbd_conf *mdev,
449 union drbd_state mask, union drbd_state val)
450{
451 drbd_change_state(mdev, CS_HARD, mask, val);
452}
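/*
 * Minimal usage sketch (editor's illustration, using the NS() helper
 * from drbd_int.h):
 *
 *	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 *
 * builds a mask covering only the .conn field and a val carrying the new
 * connection state; "ns.i = (os.i & ~mask.i) | val.i" in
 * drbd_change_state() then keeps every other field of the current state
 * and overrides just that one.
 */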
453
454static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
455static int is_valid_state_transition(struct drbd_conf *,
456 union drbd_state, union drbd_state);
457static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
458 union drbd_state ns, int *warn_sync_abort);
459int drbd_send_state_req(struct drbd_conf *,
460 union drbd_state, union drbd_state);
461
462static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
463 union drbd_state mask, union drbd_state val)
464{
465 union drbd_state os, ns;
466 unsigned long flags;
467 int rv;
468
469 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
470 return SS_CW_SUCCESS;
471
472 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
473 return SS_CW_FAILED_BY_PEER;
474
475 rv = 0;
476 spin_lock_irqsave(&mdev->req_lock, flags);
477 os = mdev->state;
478 ns.i = (os.i & ~mask.i) | val.i;
479 ns = sanitize_state(mdev, os, ns, NULL);
480
481 if (!cl_wide_st_chg(mdev, os, ns))
482 rv = SS_CW_NO_NEED;
483 if (!rv) {
484 rv = is_valid_state(mdev, ns);
485 if (rv == SS_SUCCESS) {
486 rv = is_valid_state_transition(mdev, ns, os);
487 if (rv == SS_SUCCESS)
488 rv = 0; /* cont waiting, otherwise fail. */
489 }
490 }
491 spin_unlock_irqrestore(&mdev->req_lock, flags);
492
493 return rv;
494}
495
496/**
 497 * drbd_req_state() - Perform a possibly cluster-wide state change
498 * @mdev: DRBD device.
499 * @mask: mask of state bits to change.
500 * @val: value of new state bits.
501 * @f: flags
502 *
503 * Should not be called directly, use drbd_request_state() or
504 * _drbd_request_state().
505 */
506static int drbd_req_state(struct drbd_conf *mdev,
507 union drbd_state mask, union drbd_state val,
508 enum chg_state_flags f)
509{
510 struct completion done;
511 unsigned long flags;
512 union drbd_state os, ns;
513 int rv;
514
515 init_completion(&done);
516
517 if (f & CS_SERIALIZE)
518 mutex_lock(&mdev->state_mutex);
519
520 spin_lock_irqsave(&mdev->req_lock, flags);
521 os = mdev->state;
522 ns.i = (os.i & ~mask.i) | val.i;
523 ns = sanitize_state(mdev, os, ns, NULL);
524
525 if (cl_wide_st_chg(mdev, os, ns)) {
526 rv = is_valid_state(mdev, ns);
527 if (rv == SS_SUCCESS)
528 rv = is_valid_state_transition(mdev, ns, os);
529 spin_unlock_irqrestore(&mdev->req_lock, flags);
530
531 if (rv < SS_SUCCESS) {
532 if (f & CS_VERBOSE)
533 print_st_err(mdev, os, ns, rv);
534 goto abort;
535 }
536
537 drbd_state_lock(mdev);
538 if (!drbd_send_state_req(mdev, mask, val)) {
539 drbd_state_unlock(mdev);
540 rv = SS_CW_FAILED_BY_PEER;
541 if (f & CS_VERBOSE)
542 print_st_err(mdev, os, ns, rv);
543 goto abort;
544 }
545
546 wait_event(mdev->state_wait,
547 (rv = _req_st_cond(mdev, mask, val)));
548
549 if (rv < SS_SUCCESS) {
550 drbd_state_unlock(mdev);
551 if (f & CS_VERBOSE)
552 print_st_err(mdev, os, ns, rv);
553 goto abort;
554 }
555 spin_lock_irqsave(&mdev->req_lock, flags);
556 os = mdev->state;
557 ns.i = (os.i & ~mask.i) | val.i;
558 rv = _drbd_set_state(mdev, ns, f, &done);
559 drbd_state_unlock(mdev);
560 } else {
561 rv = _drbd_set_state(mdev, ns, f, &done);
562 }
563
564 spin_unlock_irqrestore(&mdev->req_lock, flags);
565
566 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
567 D_ASSERT(current != mdev->worker.task);
568 wait_for_completion(&done);
569 }
570
571abort:
572 if (f & CS_SERIALIZE)
573 mutex_unlock(&mdev->state_mutex);
574
575 return rv;
576}
577
578/**
579 * _drbd_request_state() - Request a state change (with flags)
580 * @mdev: DRBD device.
581 * @mask: mask of state bits to change.
582 * @val: value of new state bits.
583 * @f: flags
584 *
585 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
586 * flag, or when logging of failed state change requests is not desired.
587 */
588int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
589 union drbd_state val, enum chg_state_flags f)
590{
591 int rv;
592
593 wait_event(mdev->state_wait,
594 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
595
596 return rv;
597}
598
599static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
600{
601 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
602 name,
603 drbd_conn_str(ns.conn),
604 drbd_role_str(ns.role),
605 drbd_role_str(ns.peer),
606 drbd_disk_str(ns.disk),
607 drbd_disk_str(ns.pdsk),
608 ns.susp ? 's' : 'r',
609 ns.aftr_isp ? 'a' : '-',
610 ns.peer_isp ? 'p' : '-',
611 ns.user_isp ? 'u' : '-'
612 );
613}
614
615void print_st_err(struct drbd_conf *mdev,
616 union drbd_state os, union drbd_state ns, int err)
617{
618 if (err == SS_IN_TRANSIENT_STATE)
619 return;
620 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
621 print_st(mdev, " state", os);
622 print_st(mdev, "wanted", ns);
623}
624
625
626#define drbd_peer_str drbd_role_str
627#define drbd_pdsk_str drbd_disk_str
628
629#define drbd_susp_str(A) ((A) ? "1" : "0")
630#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
631#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
632#define drbd_user_isp_str(A) ((A) ? "1" : "0")
633
634#define PSC(A) \
635 ({ if (ns.A != os.A) { \
636 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
637 drbd_##A##_str(os.A), \
638 drbd_##A##_str(ns.A)); \
639 } })
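/*
 * For illustration (editor's note): a promotion of a connected device
 * makes the PSC() calls in __drbd_set_state() log something like
 *
 *	role( Secondary -> Primary )
 *
 * with one such clause appended for every field that actually changed.
 */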
640
641/**
642 * is_valid_state() - Returns an SS_ error code if ns is not valid
643 * @mdev: DRBD device.
644 * @ns: State to consider.
645 */
646static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
647{
648 /* See drbd_state_sw_errors in drbd_strings.c */
649
650 enum drbd_fencing_p fp;
651 int rv = SS_SUCCESS;
652
653 fp = FP_DONT_CARE;
654 if (get_ldev(mdev)) {
655 fp = mdev->ldev->dc.fencing;
656 put_ldev(mdev);
657 }
658
659 if (get_net_conf(mdev)) {
660 if (!mdev->net_conf->two_primaries &&
661 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
662 rv = SS_TWO_PRIMARIES;
663 put_net_conf(mdev);
664 }
665
666 if (rv <= 0)
667 /* already found a reason to abort */;
668 else if (ns.role == R_SECONDARY && mdev->open_cnt)
669 rv = SS_DEVICE_IN_USE;
670
671 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
672 rv = SS_NO_UP_TO_DATE_DISK;
673
674 else if (fp >= FP_RESOURCE &&
675 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
676 rv = SS_PRIMARY_NOP;
677
678 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
679 rv = SS_NO_UP_TO_DATE_DISK;
680
681 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
682 rv = SS_NO_LOCAL_DISK;
683
684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
685 rv = SS_NO_REMOTE_DISK;
686
687 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
688 rv = SS_NO_UP_TO_DATE_DISK;
689
690 else if ((ns.conn == C_CONNECTED ||
691 ns.conn == C_WF_BITMAP_S ||
692 ns.conn == C_SYNC_SOURCE ||
693 ns.conn == C_PAUSED_SYNC_S) &&
694 ns.disk == D_OUTDATED)
695 rv = SS_CONNECTED_OUTDATES;
696
697 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
698 (mdev->sync_conf.verify_alg[0] == 0))
699 rv = SS_NO_VERIFY_ALG;
700
701 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
702 mdev->agreed_pro_version < 88)
703 rv = SS_NOT_SUPPORTED;
704
705 return rv;
706}
707
708/**
709 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
710 * @mdev: DRBD device.
711 * @ns: new state.
712 * @os: old state.
713 */
714static int is_valid_state_transition(struct drbd_conf *mdev,
715 union drbd_state ns, union drbd_state os)
716{
717 int rv = SS_SUCCESS;
718
719 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
720 os.conn > C_CONNECTED)
721 rv = SS_RESYNC_RUNNING;
722
723 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
724 rv = SS_ALREADY_STANDALONE;
725
726 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
727 rv = SS_IS_DISKLESS;
728
729 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
730 rv = SS_NO_NET_CONFIG;
731
732 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
733 rv = SS_LOWER_THAN_OUTDATED;
734
735 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
736 rv = SS_IN_TRANSIENT_STATE;
737
738 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
739 rv = SS_IN_TRANSIENT_STATE;
740
741 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
742 rv = SS_NEED_CONNECTION;
743
744 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
745 ns.conn != os.conn && os.conn > C_CONNECTED)
746 rv = SS_RESYNC_RUNNING;
747
748 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
749 os.conn < C_CONNECTED)
750 rv = SS_NEED_CONNECTION;
751
752 return rv;
753}
754
755/**
756 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
757 * @mdev: DRBD device.
758 * @os: old state.
759 * @ns: new state.
 760 * @warn_sync_abort: optional; if non-NULL, set to 1 when the transition implicitly aborts a running resync.
 761 *
 762 * When we lose connection, we have to set the state of the peer's disk (pdsk)
763 * to D_UNKNOWN. This rule and many more along those lines are in this function.
764 */
765static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
766 union drbd_state ns, int *warn_sync_abort)
767{
768 enum drbd_fencing_p fp;
769
770 fp = FP_DONT_CARE;
771 if (get_ldev(mdev)) {
772 fp = mdev->ldev->dc.fencing;
773 put_ldev(mdev);
774 }
775
 776 /* Never let a network-error state (C_TIMEOUT..C_TEAR_DOWN) take effect while the network part is not even configured */
777 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
778 os.conn <= C_DISCONNECTING)
779 ns.conn = os.conn;
780
781 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
782 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
783 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
784 ns.conn = os.conn;
785
786 /* After C_DISCONNECTING only C_STANDALONE may follow */
787 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
788 ns.conn = os.conn;
789
790 if (ns.conn < C_CONNECTED) {
791 ns.peer_isp = 0;
792 ns.peer = R_UNKNOWN;
793 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
794 ns.pdsk = D_UNKNOWN;
795 }
796
797 /* Clear the aftr_isp when becoming unconfigured */
798 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
799 ns.aftr_isp = 0;
800
801 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
802 ns.pdsk = D_UNKNOWN;
803
804 /* Abort resync if a disk fails/detaches */
805 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
806 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
807 if (warn_sync_abort)
808 *warn_sync_abort = 1;
809 ns.conn = C_CONNECTED;
810 }
811
812 if (ns.conn >= C_CONNECTED &&
813 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
814 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
815 switch (ns.conn) {
816 case C_WF_BITMAP_T:
817 case C_PAUSED_SYNC_T:
818 ns.disk = D_OUTDATED;
819 break;
820 case C_CONNECTED:
821 case C_WF_BITMAP_S:
822 case C_SYNC_SOURCE:
823 case C_PAUSED_SYNC_S:
824 ns.disk = D_UP_TO_DATE;
825 break;
826 case C_SYNC_TARGET:
827 ns.disk = D_INCONSISTENT;
828 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
829 break;
830 }
831 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
832 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
833 }
834
835 if (ns.conn >= C_CONNECTED &&
836 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
837 switch (ns.conn) {
838 case C_CONNECTED:
839 case C_WF_BITMAP_T:
840 case C_PAUSED_SYNC_T:
841 case C_SYNC_TARGET:
842 ns.pdsk = D_UP_TO_DATE;
843 break;
844 case C_WF_BITMAP_S:
845 case C_PAUSED_SYNC_S:
846 /* remap any consistent state to D_OUTDATED,
847 * but disallow "upgrade" of not even consistent states.
848 */
849 ns.pdsk =
850 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
851 ? os.pdsk : D_OUTDATED;
852 break;
853 case C_SYNC_SOURCE:
854 ns.pdsk = D_INCONSISTENT;
855 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
856 break;
857 }
858 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
859 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
860 }
861
862 /* Connection breaks down before we finished "Negotiating" */
863 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
864 get_ldev_if_state(mdev, D_NEGOTIATING)) {
865 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
866 ns.disk = mdev->new_state_tmp.disk;
867 ns.pdsk = mdev->new_state_tmp.pdsk;
868 } else {
869 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
870 ns.disk = D_DISKLESS;
871 ns.pdsk = D_UNKNOWN;
872 }
873 put_ldev(mdev);
874 }
875
876 if (fp == FP_STONITH &&
877 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
878 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
879 ns.susp = 1;
880
881 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
882 if (ns.conn == C_SYNC_SOURCE)
883 ns.conn = C_PAUSED_SYNC_S;
884 if (ns.conn == C_SYNC_TARGET)
885 ns.conn = C_PAUSED_SYNC_T;
886 } else {
887 if (ns.conn == C_PAUSED_SYNC_S)
888 ns.conn = C_SYNC_SOURCE;
889 if (ns.conn == C_PAUSED_SYNC_T)
890 ns.conn = C_SYNC_TARGET;
891 }
892
893 return ns;
894}
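/*
 * Example of the kind of correction sanitize_state() applies (editor's
 * illustration): if the connection is lost while the peer disk was still
 * believed D_UP_TO_DATE, a requested ns with conn < C_CONNECTED is
 * rewritten so that ns.peer becomes R_UNKNOWN and ns.pdsk becomes
 * D_UNKNOWN, because without a connection we can no longer know either.
 */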
895
896/* helper for __drbd_set_state */
897static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
898{
899 if (cs == C_VERIFY_T) {
900 /* starting online verify from an arbitrary position
901 * does not fit well into the existing protocol.
902 * on C_VERIFY_T, we initialize ov_left and friends
903 * implicitly in receive_DataRequest once the
904 * first P_OV_REQUEST is received */
905 mdev->ov_start_sector = ~(sector_t)0;
906 } else {
907 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
908 if (bit >= mdev->rs_total)
909 mdev->ov_start_sector =
910 BM_BIT_TO_SECT(mdev->rs_total - 1);
911 mdev->ov_position = mdev->ov_start_sector;
912 }
913}
914
915/**
916 * __drbd_set_state() - Set a new DRBD state
917 * @mdev: DRBD device.
918 * @ns: new state.
919 * @flags: Flags
 920 * @done: Optional completion that will be completed after after_state_ch() has finished
921 *
922 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
923 */
924int __drbd_set_state(struct drbd_conf *mdev,
925 union drbd_state ns, enum chg_state_flags flags,
926 struct completion *done)
927{
928 union drbd_state os;
929 int rv = SS_SUCCESS;
930 int warn_sync_abort = 0;
931 struct after_state_chg_work *ascw;
932
933 os = mdev->state;
934
935 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
936
937 if (ns.i == os.i)
938 return SS_NOTHING_TO_DO;
939
940 if (!(flags & CS_HARD)) {
941 /* pre-state-change checks ; only look at ns */
942 /* See drbd_state_sw_errors in drbd_strings.c */
943
944 rv = is_valid_state(mdev, ns);
945 if (rv < SS_SUCCESS) {
946 /* If the old state was illegal as well, then let
947 this happen...*/
948
949 if (is_valid_state(mdev, os) == rv) {
950 dev_err(DEV, "Considering state change from bad state. "
951 "Error would be: '%s'\n",
952 drbd_set_st_err_str(rv));
953 print_st(mdev, "old", os);
954 print_st(mdev, "new", ns);
955 rv = is_valid_state_transition(mdev, ns, os);
956 }
957 } else
958 rv = is_valid_state_transition(mdev, ns, os);
959 }
960
961 if (rv < SS_SUCCESS) {
962 if (flags & CS_VERBOSE)
963 print_st_err(mdev, os, ns, rv);
964 return rv;
965 }
966
967 if (warn_sync_abort)
968 dev_warn(DEV, "Resync aborted.\n");
969
970 {
971 char *pbp, pb[300];
972 pbp = pb;
973 *pbp = 0;
974 PSC(role);
975 PSC(peer);
976 PSC(conn);
977 PSC(disk);
978 PSC(pdsk);
979 PSC(susp);
980 PSC(aftr_isp);
981 PSC(peer_isp);
982 PSC(user_isp);
983 dev_info(DEV, "%s\n", pb);
984 }
985
986 /* solve the race between becoming unconfigured,
987 * worker doing the cleanup, and
988 * admin reconfiguring us:
989 * on (re)configure, first set CONFIG_PENDING,
990 * then wait for a potentially exiting worker,
991 * start the worker, and schedule one no_op.
992 * then proceed with configuration.
993 */
994 if (ns.disk == D_DISKLESS &&
995 ns.conn == C_STANDALONE &&
996 ns.role == R_SECONDARY &&
997 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
998 set_bit(DEVICE_DYING, &mdev->flags);
999
1000 mdev->state.i = ns.i;
1001 wake_up(&mdev->misc_wait);
1002 wake_up(&mdev->state_wait);
1003
1004 /* post-state-change actions */
1005 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
1006 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1007 mod_timer(&mdev->resync_timer, jiffies);
1008 }
1009
1010 /* aborted verify run. log the last position */
1011 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1012 ns.conn < C_CONNECTED) {
1013 mdev->ov_start_sector =
1014 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1015 dev_info(DEV, "Online Verify reached sector %llu\n",
1016 (unsigned long long)mdev->ov_start_sector);
1017 }
1018
1019 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1020 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1021 dev_info(DEV, "Syncer continues.\n");
1022 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1023 if (ns.conn == C_SYNC_TARGET) {
1024 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1025 mod_timer(&mdev->resync_timer, jiffies);
1026 /* This if (!test_bit) is only needed for the case
 1027 that a device that has ceased to use its timer,
 1028 i.e. it is already in drbd_resync_finished(), gets
1029 paused and resumed. */
1030 }
1031 }
1032
1033 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1034 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1035 dev_info(DEV, "Resync suspended\n");
1036 mdev->rs_mark_time = jiffies;
1037 if (ns.conn == C_PAUSED_SYNC_T)
1038 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1039 }
1040
1041 if (os.conn == C_CONNECTED &&
1042 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1043 mdev->ov_position = 0;
1044 mdev->rs_total =
1045 mdev->rs_mark_left = drbd_bm_bits(mdev);
1046 if (mdev->agreed_pro_version >= 90)
1047 set_ov_position(mdev, ns.conn);
1048 else
1049 mdev->ov_start_sector = 0;
1050 mdev->ov_left = mdev->rs_total
1051 - BM_SECT_TO_BIT(mdev->ov_position);
1052 mdev->rs_start =
1053 mdev->rs_mark_time = jiffies;
1054 mdev->ov_last_oos_size = 0;
1055 mdev->ov_last_oos_start = 0;
1056
1057 if (ns.conn == C_VERIFY_S) {
1058 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1059 (unsigned long long)mdev->ov_position);
1060 mod_timer(&mdev->resync_timer, jiffies);
1061 }
1062 }
1063
1064 if (get_ldev(mdev)) {
1065 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1066 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1067 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1068
1069 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1070 mdf |= MDF_CRASHED_PRIMARY;
1071 if (mdev->state.role == R_PRIMARY ||
1072 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1073 mdf |= MDF_PRIMARY_IND;
1074 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1075 mdf |= MDF_CONNECTED_IND;
1076 if (mdev->state.disk > D_INCONSISTENT)
1077 mdf |= MDF_CONSISTENT;
1078 if (mdev->state.disk > D_OUTDATED)
1079 mdf |= MDF_WAS_UP_TO_DATE;
1080 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1081 mdf |= MDF_PEER_OUT_DATED;
1082 if (mdf != mdev->ldev->md.flags) {
1083 mdev->ldev->md.flags = mdf;
1084 drbd_md_mark_dirty(mdev);
1085 }
1086 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1087 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1088 put_ldev(mdev);
1089 }
1090
 1091 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
1092 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1093 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1094 set_bit(CONSIDER_RESYNC, &mdev->flags);
1095
1096 /* Receiver should clean up itself */
1097 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1098 drbd_thread_stop_nowait(&mdev->receiver);
1099
 1100 /* Now that the receiver has finished cleaning up after itself, it should die */
1101 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1102 drbd_thread_stop_nowait(&mdev->receiver);
1103
1104 /* Upon network failure, we need to restart the receiver. */
1105 if (os.conn > C_TEAR_DOWN &&
1106 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1107 drbd_thread_restart_nowait(&mdev->receiver);
1108
1109 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1110 if (ascw) {
1111 ascw->os = os;
1112 ascw->ns = ns;
1113 ascw->flags = flags;
1114 ascw->w.cb = w_after_state_ch;
1115 ascw->done = done;
1116 drbd_queue_work(&mdev->data.work, &ascw->w);
1117 } else {
1118 dev_warn(DEV, "Could not kmalloc an ascw\n");
1119 }
1120
1121 return rv;
1122}
1123
1124static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1125{
1126 struct after_state_chg_work *ascw =
1127 container_of(w, struct after_state_chg_work, w);
1128 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1129 if (ascw->flags & CS_WAIT_COMPLETE) {
1130 D_ASSERT(ascw->done != NULL);
1131 complete(ascw->done);
1132 }
1133 kfree(ascw);
1134
1135 return 1;
1136}
1137
1138static void abw_start_sync(struct drbd_conf *mdev, int rv)
1139{
1140 if (rv) {
 1141 dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1142 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1143 return;
1144 }
1145
1146 switch (mdev->state.conn) {
1147 case C_STARTING_SYNC_T:
1148 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1149 break;
1150 case C_STARTING_SYNC_S:
1151 drbd_start_resync(mdev, C_SYNC_SOURCE);
1152 break;
1153 }
1154}
1155
1156/**
1157 * after_state_ch() - Perform after state change actions that may sleep
1158 * @mdev: DRBD device.
1159 * @os: old state.
1160 * @ns: new state.
1161 * @flags: Flags
1162 */
1163static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1164 union drbd_state ns, enum chg_state_flags flags)
1165{
1166 enum drbd_fencing_p fp;
1167
1168 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1169 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1170 if (mdev->p_uuid)
1171 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1172 }
1173
1174 fp = FP_DONT_CARE;
1175 if (get_ldev(mdev)) {
1176 fp = mdev->ldev->dc.fencing;
1177 put_ldev(mdev);
1178 }
1179
1180 /* Inform userspace about the change... */
1181 drbd_bcast_state(mdev, ns);
1182
1183 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1184 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1185 drbd_khelper(mdev, "pri-on-incon-degr");
1186
1187 /* Here we have the actions that are performed after a
1188 state change. This function might sleep */
1189
1190 if (fp == FP_STONITH && ns.susp) {
1191 /* case1: The outdate peer handler is successful:
1192 * case2: The connection was established again: */
1193 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
1194 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1195 tl_clear(mdev);
1196 spin_lock_irq(&mdev->req_lock);
1197 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1198 spin_unlock_irq(&mdev->req_lock);
1199 }
1200 }
1201 /* Do not change the order of the if above and the two below... */
1202 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1203 drbd_send_uuids(mdev);
1204 drbd_send_state(mdev);
1205 }
1206 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1207 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1208
1209 /* Lost contact to peer's copy of the data */
1210 if ((os.pdsk >= D_INCONSISTENT &&
1211 os.pdsk != D_UNKNOWN &&
1212 os.pdsk != D_OUTDATED)
1213 && (ns.pdsk < D_INCONSISTENT ||
1214 ns.pdsk == D_UNKNOWN ||
1215 ns.pdsk == D_OUTDATED)) {
1216 kfree(mdev->p_uuid);
1217 mdev->p_uuid = NULL;
1218 if (get_ldev(mdev)) {
1219 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1220 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1221 drbd_uuid_new_current(mdev);
1222 drbd_send_uuids(mdev);
1223 }
1224 put_ldev(mdev);
1225 }
1226 }
1227
1228 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1229 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1230 drbd_uuid_new_current(mdev);
1231
1232 /* D_DISKLESS Peer becomes secondary */
1233 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1234 drbd_al_to_on_disk_bm(mdev);
1235 put_ldev(mdev);
1236 }
1237
1238 /* Last part of the attaching process ... */
1239 if (ns.conn >= C_CONNECTED &&
1240 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1241 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1242 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
 1243 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1244 drbd_send_uuids(mdev);
1245 drbd_send_state(mdev);
1246 }
1247
1248 /* We want to pause/continue resync, tell peer. */
1249 if (ns.conn >= C_CONNECTED &&
1250 ((os.aftr_isp != ns.aftr_isp) ||
1251 (os.user_isp != ns.user_isp)))
1252 drbd_send_state(mdev);
1253
1254 /* In case one of the isp bits got set, suspend other devices. */
1255 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1256 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1257 suspend_other_sg(mdev);
1258
 1259 /* Make sure the peer gets informed about any state changes
 1260 (ISP bits) that happened while we were in WFReportParams. */
1261 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1262 drbd_send_state(mdev);
1263
 1264 /* We are in the process of starting a full sync... */
1265 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1266 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1267 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1268
 1269 /* We are invalidating ourselves... */
1270 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1271 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1272 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1273
1274 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1275 enum drbd_io_error_p eh;
1276
1277 eh = EP_PASS_ON;
1278 if (get_ldev_if_state(mdev, D_FAILED)) {
1279 eh = mdev->ldev->dc.on_io_error;
1280 put_ldev(mdev);
1281 }
1282
1283 drbd_rs_cancel_all(mdev);
1284 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1285 and it is D_DISKLESS here, local_cnt can only go down, it can
1286 not increase... It will reach zero */
1287 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1288 mdev->rs_total = 0;
1289 mdev->rs_failed = 0;
1290 atomic_set(&mdev->rs_pending_cnt, 0);
1291
1292 spin_lock_irq(&mdev->req_lock);
1293 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1294 spin_unlock_irq(&mdev->req_lock);
1295
1296 if (eh == EP_CALL_HELPER)
1297 drbd_khelper(mdev, "local-io-error");
1298 }
1299
1300 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1301
1302 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1303 if (drbd_send_state(mdev))
1304 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1305 else
1306 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1307 }
1308
 1309 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1310 lc_destroy(mdev->resync);
1311 mdev->resync = NULL;
1312 lc_destroy(mdev->act_log);
1313 mdev->act_log = NULL;
1314 __no_warn(local,
1315 drbd_free_bc(mdev->ldev);
1316 mdev->ldev = NULL;);
1317
1318 if (mdev->md_io_tmpp)
1319 __free_page(mdev->md_io_tmpp);
1320 }
1321
1322 /* Disks got bigger while they were detached */
1323 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1324 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1325 if (ns.conn == C_CONNECTED)
1326 resync_after_online_grow(mdev);
1327 }
1328
1329 /* A resync finished or aborted, wake paused devices... */
1330 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1331 (os.peer_isp && !ns.peer_isp) ||
1332 (os.user_isp && !ns.user_isp))
1333 resume_next_sg(mdev);
1334
1335 /* Upon network connection, we need to start the receiver */
1336 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1337 drbd_thread_start(&mdev->receiver);
1338
1339 /* Terminate worker thread if we are unconfigured - it will be
1340 restarted as needed... */
1341 if (ns.disk == D_DISKLESS &&
1342 ns.conn == C_STANDALONE &&
1343 ns.role == R_SECONDARY) {
1344 if (os.aftr_isp != ns.aftr_isp)
1345 resume_next_sg(mdev);
1346 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1347 if (test_bit(DEVICE_DYING, &mdev->flags))
1348 drbd_thread_stop_nowait(&mdev->worker);
1349 }
1350
1351 drbd_md_sync(mdev);
1352}
1353
1354
1355static int drbd_thread_setup(void *arg)
1356{
1357 struct drbd_thread *thi = (struct drbd_thread *) arg;
1358 struct drbd_conf *mdev = thi->mdev;
1359 unsigned long flags;
1360 int retval;
1361
1362restart:
1363 retval = thi->function(thi);
1364
1365 spin_lock_irqsave(&thi->t_lock, flags);
1366
1367 /* if the receiver has been "Exiting", the last thing it did
1368 * was set the conn state to "StandAlone",
1369 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1370 * and receiver thread will be "started".
1371 * drbd_thread_start needs to set "Restarting" in that case.
1372 * t_state check and assignment needs to be within the same spinlock,
1373 * so either thread_start sees Exiting, and can remap to Restarting,
 1374 * or thread_start sees None, and can proceed as normal.
1375 */
1376
1377 if (thi->t_state == Restarting) {
1378 dev_info(DEV, "Restarting %s\n", current->comm);
1379 thi->t_state = Running;
1380 spin_unlock_irqrestore(&thi->t_lock, flags);
1381 goto restart;
1382 }
1383
1384 thi->task = NULL;
1385 thi->t_state = None;
1386 smp_mb();
1387 complete(&thi->stop);
1388 spin_unlock_irqrestore(&thi->t_lock, flags);
1389
1390 dev_info(DEV, "Terminating %s\n", current->comm);
1391
1392 /* Release mod reference taken when thread was started */
1393 module_put(THIS_MODULE);
1394 return retval;
1395}
1396
1397static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1398 int (*func) (struct drbd_thread *))
1399{
1400 spin_lock_init(&thi->t_lock);
1401 thi->task = NULL;
1402 thi->t_state = None;
1403 thi->function = func;
1404 thi->mdev = mdev;
1405}
1406
1407int drbd_thread_start(struct drbd_thread *thi)
1408{
1409 struct drbd_conf *mdev = thi->mdev;
1410 struct task_struct *nt;
1411 unsigned long flags;
1412
1413 const char *me =
1414 thi == &mdev->receiver ? "receiver" :
1415 thi == &mdev->asender ? "asender" :
1416 thi == &mdev->worker ? "worker" : "NONSENSE";
1417
1418 /* is used from state engine doing drbd_thread_stop_nowait,
1419 * while holding the req lock irqsave */
1420 spin_lock_irqsave(&thi->t_lock, flags);
1421
1422 switch (thi->t_state) {
1423 case None:
1424 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1425 me, current->comm, current->pid);
1426
1427 /* Get ref on module for thread - this is released when thread exits */
1428 if (!try_module_get(THIS_MODULE)) {
1429 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1430 spin_unlock_irqrestore(&thi->t_lock, flags);
1431 return FALSE;
1432 }
1433
1434 init_completion(&thi->stop);
1435 D_ASSERT(thi->task == NULL);
1436 thi->reset_cpu_mask = 1;
1437 thi->t_state = Running;
1438 spin_unlock_irqrestore(&thi->t_lock, flags);
 1439 flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1440
1441 nt = kthread_create(drbd_thread_setup, (void *) thi,
1442 "drbd%d_%s", mdev_to_minor(mdev), me);
1443
1444 if (IS_ERR(nt)) {
1445 dev_err(DEV, "Couldn't start thread\n");
1446
1447 module_put(THIS_MODULE);
1448 return FALSE;
1449 }
1450 spin_lock_irqsave(&thi->t_lock, flags);
1451 thi->task = nt;
1452 thi->t_state = Running;
1453 spin_unlock_irqrestore(&thi->t_lock, flags);
1454 wake_up_process(nt);
1455 break;
1456 case Exiting:
1457 thi->t_state = Restarting;
1458 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1459 me, current->comm, current->pid);
1460 /* fall through */
1461 case Running:
1462 case Restarting:
1463 default:
1464 spin_unlock_irqrestore(&thi->t_lock, flags);
1465 break;
1466 }
1467
1468 return TRUE;
1469}
1470
1471
1472void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1473{
1474 unsigned long flags;
1475
1476 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1477
1478 /* may be called from state engine, holding the req lock irqsave */
1479 spin_lock_irqsave(&thi->t_lock, flags);
1480
1481 if (thi->t_state == None) {
1482 spin_unlock_irqrestore(&thi->t_lock, flags);
1483 if (restart)
1484 drbd_thread_start(thi);
1485 return;
1486 }
1487
1488 if (thi->t_state != ns) {
1489 if (thi->task == NULL) {
1490 spin_unlock_irqrestore(&thi->t_lock, flags);
1491 return;
1492 }
1493
1494 thi->t_state = ns;
1495 smp_mb();
1496 init_completion(&thi->stop);
1497 if (thi->task != current)
1498 force_sig(DRBD_SIGKILL, thi->task);
1499
1500 }
1501
1502 spin_unlock_irqrestore(&thi->t_lock, flags);
1503
1504 if (wait)
1505 wait_for_completion(&thi->stop);
1506}
1507
1508#ifdef CONFIG_SMP
1509/**
1510 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1511 * @mdev: DRBD device.
1512 *
1513 * Forces all threads of a device onto the same CPU. This is beneficial for
 1514 * DRBD's performance. May be overridden by the user's configuration.
1515 */
1516void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1517{
1518 int ord, cpu;
1519
1520 /* user override. */
1521 if (cpumask_weight(mdev->cpu_mask))
1522 return;
1523
1524 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1525 for_each_online_cpu(cpu) {
1526 if (ord-- == 0) {
1527 cpumask_set_cpu(cpu, mdev->cpu_mask);
1528 return;
1529 }
1530 }
1531 /* should not be reached */
1532 cpumask_setall(mdev->cpu_mask);
1533}
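/*
 * Worked example (editor's note): with 4 online CPUs, minor 5 yields
 * ord = 5 % 4 = 1, so the loop above skips the first online CPU and pins
 * all threads of that device to the second one.
 */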
1534
1535/**
1536 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1537 * @mdev: DRBD device.
1538 *
1539 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1540 * prematurely.
1541 */
1542void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1543{
1544 struct task_struct *p = current;
1545 struct drbd_thread *thi =
1546 p == mdev->asender.task ? &mdev->asender :
1547 p == mdev->receiver.task ? &mdev->receiver :
1548 p == mdev->worker.task ? &mdev->worker :
1549 NULL;
1550 ERR_IF(thi == NULL)
1551 return;
1552 if (!thi->reset_cpu_mask)
1553 return;
1554 thi->reset_cpu_mask = 0;
1555 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1556}
1557#endif
1558
1559/* the appropriate socket mutex must be held already */
1560int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1561 enum drbd_packets cmd, struct p_header *h,
1562 size_t size, unsigned msg_flags)
1563{
1564 int sent, ok;
1565
1566 ERR_IF(!h) return FALSE;
1567 ERR_IF(!size) return FALSE;
1568
1569 h->magic = BE_DRBD_MAGIC;
1570 h->command = cpu_to_be16(cmd);
1571 h->length = cpu_to_be16(size-sizeof(struct p_header));
1572
1573 sent = drbd_send(mdev, sock, h, size, msg_flags);
1574
1575 ok = (sent == size);
1576 if (!ok)
1577 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1578 cmdname(cmd), (int)size, sent);
1579 return ok;
1580}
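/*
 * Editor's note (derived from the code above): every packet sent this way
 * starts with the fixed struct p_header, i.e. a 32-bit magic, a 16-bit
 * command and a 16-bit payload length (size minus the header itself),
 * all in big-endian byte order, followed by the command-specific payload.
 */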
1581
1582/* don't pass the socket. we may only look at it
1583 * when we hold the appropriate socket mutex.
1584 */
1585int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1586 enum drbd_packets cmd, struct p_header *h, size_t size)
1587{
1588 int ok = 0;
1589 struct socket *sock;
1590
1591 if (use_data_socket) {
1592 mutex_lock(&mdev->data.mutex);
1593 sock = mdev->data.socket;
1594 } else {
1595 mutex_lock(&mdev->meta.mutex);
1596 sock = mdev->meta.socket;
1597 }
1598
1599 /* drbd_disconnect() could have called drbd_free_sock()
1600 * while we were waiting in down()... */
1601 if (likely(sock != NULL))
1602 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1603
1604 if (use_data_socket)
1605 mutex_unlock(&mdev->data.mutex);
1606 else
1607 mutex_unlock(&mdev->meta.mutex);
1608 return ok;
1609}
1610
1611int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1612 size_t size)
1613{
1614 struct p_header h;
1615 int ok;
1616
1617 h.magic = BE_DRBD_MAGIC;
1618 h.command = cpu_to_be16(cmd);
1619 h.length = cpu_to_be16(size);
1620
1621 if (!drbd_get_data_sock(mdev))
1622 return 0;
1623
1624 ok = (sizeof(h) ==
1625 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1626 ok = ok && (size ==
1627 drbd_send(mdev, mdev->data.socket, data, size, 0));
1628
1629 drbd_put_data_sock(mdev);
1630
1631 return ok;
1632}
1633
1634int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1635{
1636 struct p_rs_param_89 *p;
1637 struct socket *sock;
1638 int size, rv;
1639 const int apv = mdev->agreed_pro_version;
1640
1641 size = apv <= 87 ? sizeof(struct p_rs_param)
1642 : apv == 88 ? sizeof(struct p_rs_param)
1643 + strlen(mdev->sync_conf.verify_alg) + 1
1644 : /* 89 */ sizeof(struct p_rs_param_89);
1645
1646 /* used from admin command context and receiver/worker context.
1647 * to avoid kmalloc, grab the socket right here,
1648 * then use the pre-allocated sbuf there */
1649 mutex_lock(&mdev->data.mutex);
1650 sock = mdev->data.socket;
1651
1652 if (likely(sock != NULL)) {
1653 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1654
1655 p = &mdev->data.sbuf.rs_param_89;
1656
1657 /* initialize verify_alg and csums_alg */
1658 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1659
1660 p->rate = cpu_to_be32(sc->rate);
1661
1662 if (apv >= 88)
1663 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1664 if (apv >= 89)
1665 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1666
1667 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1668 } else
1669 rv = 0; /* not ok */
1670
1671 mutex_unlock(&mdev->data.mutex);
1672
1673 return rv;
1674}
1675
1676int drbd_send_protocol(struct drbd_conf *mdev)
1677{
1678 struct p_protocol *p;
 1679 int size, cf, rv;
1680
1681 size = sizeof(struct p_protocol);
1682
1683 if (mdev->agreed_pro_version >= 87)
1684 size += strlen(mdev->net_conf->integrity_alg) + 1;
1685
1686 /* we must not recurse into our own queue,
1687 * as that is blocked during handshake */
1688 p = kmalloc(size, GFP_NOIO);
1689 if (p == NULL)
1690 return 0;
1691
1692 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1693 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1694 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1695 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
1696 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1697
1698 cf = 0;
1699 if (mdev->net_conf->want_lose)
1700 cf |= CF_WANT_LOSE;
1701 if (mdev->net_conf->dry_run) {
1702 if (mdev->agreed_pro_version >= 92)
1703 cf |= CF_DRY_RUN;
1704 else {
1705 dev_err(DEV, "--dry-run is not supported by peer");
 1706 kfree(p);
1707 return 0;
1708 }
1709 }
1710 p->conn_flags = cpu_to_be32(cf);
1711
1712 if (mdev->agreed_pro_version >= 87)
1713 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1714
1715 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1716 (struct p_header *)p, size);
1717 kfree(p);
1718 return rv;
1719}
1720
1721int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1722{
1723 struct p_uuids p;
1724 int i;
1725
1726 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1727 return 1;
1728
1729 for (i = UI_CURRENT; i < UI_SIZE; i++)
1730 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1731
1732 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1733 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1734 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1735 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1736 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1737 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1738
1739 put_ldev(mdev);
1740
1741 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1742 (struct p_header *)&p, sizeof(p));
1743}
1744
1745int drbd_send_uuids(struct drbd_conf *mdev)
1746{
1747 return _drbd_send_uuids(mdev, 0);
1748}
1749
1750int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1751{
1752 return _drbd_send_uuids(mdev, 8);
1753}
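/*
 * For reference (derived from the code above): the uuid_flags word sent
 * with P_UUIDS uses the values 1 (want lose / discard my data),
 * 2 (crashed primary), 4 (disk was Inconsistent while negotiating) and
 * 8 (skip initial sync).
 */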
1754
1755
1756int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1757{
1758 struct p_rs_uuid p;
1759
1760 p.uuid = cpu_to_be64(val);
1761
1762 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1763 (struct p_header *)&p, sizeof(p));
1764}
1765
 1766int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1767{
1768 struct p_sizes p;
1769 sector_t d_size, u_size;
1770 int q_order_type;
1771 int ok;
1772
1773 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1774 D_ASSERT(mdev->ldev->backing_bdev);
1775 d_size = drbd_get_max_capacity(mdev->ldev);
1776 u_size = mdev->ldev->dc.disk_size;
1777 q_order_type = drbd_queue_order_type(mdev);
1778 put_ldev(mdev);
1779 } else {
1780 d_size = 0;
1781 u_size = 0;
1782 q_order_type = QUEUE_ORDERED_NONE;
1783 }
1784
1785 p.d_size = cpu_to_be64(d_size);
1786 p.u_size = cpu_to_be64(u_size);
1787 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1788 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1789 p.queue_order_type = cpu_to_be16(q_order_type);
1790 p.dds_flags = cpu_to_be16(flags);
1791
1792 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1793 (struct p_header *)&p, sizeof(p));
1794 return ok;
1795}
1796
1797/**
1798 * drbd_send_state() - Sends the drbd state to the peer
1799 * @mdev: DRBD device.
1800 */
1801int drbd_send_state(struct drbd_conf *mdev)
1802{
1803 struct socket *sock;
1804 struct p_state p;
1805 int ok = 0;
1806
 1807 /* Grab state lock so we won't send state if we're in the middle
1808 * of a cluster wide state change on another thread */
1809 drbd_state_lock(mdev);
1810
1811 mutex_lock(&mdev->data.mutex);
1812
1813 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1814 sock = mdev->data.socket;
1815
1816 if (likely(sock != NULL)) {
1817 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1818 (struct p_header *)&p, sizeof(p), 0);
1819 }
1820
1821 mutex_unlock(&mdev->data.mutex);
1822
1823 drbd_state_unlock(mdev);
1824 return ok;
1825}
1826
1827int drbd_send_state_req(struct drbd_conf *mdev,
1828 union drbd_state mask, union drbd_state val)
1829{
1830 struct p_req_state p;
1831
1832 p.mask = cpu_to_be32(mask.i);
1833 p.val = cpu_to_be32(val.i);
1834
1835 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1836 (struct p_header *)&p, sizeof(p));
1837}
1838
1839int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1840{
1841 struct p_req_state_reply p;
1842
1843 p.retcode = cpu_to_be32(retcode);
1844
1845 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1846 (struct p_header *)&p, sizeof(p));
1847}
1848
1849int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1850 struct p_compressed_bm *p,
1851 struct bm_xfer_ctx *c)
1852{
1853 struct bitstream bs;
1854 unsigned long plain_bits;
1855 unsigned long tmp;
1856 unsigned long rl;
1857 unsigned len;
1858 unsigned toggle;
1859 int bits;
1860
1861 /* may we use this feature? */
1862 if ((mdev->sync_conf.use_rle == 0) ||
1863 (mdev->agreed_pro_version < 90))
1864 return 0;
1865
1866 if (c->bit_offset >= c->bm_bits)
1867 return 0; /* nothing to do. */
1868
 1869 	/* use at most this many bytes */
1870 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1871 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1872 /* plain bits covered in this code string */
1873 plain_bits = 0;
1874
1875 /* p->encoding & 0x80 stores whether the first run length is set.
1876 * bit offset is implicit.
1877 * start with toggle == 2 to be able to tell the first iteration */
1878 toggle = 2;
1879
 1880 	/* see how many plain bits we can stuff into one packet
 1881 	 * using RLE and VLI. */
1882 do {
1883 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1884 : _drbd_bm_find_next(mdev, c->bit_offset);
1885 if (tmp == -1UL)
1886 tmp = c->bm_bits;
1887 rl = tmp - c->bit_offset;
1888
1889 if (toggle == 2) { /* first iteration */
1890 if (rl == 0) {
1891 /* the first checked bit was set,
1892 * store start value, */
1893 DCBP_set_start(p, 1);
1894 /* but skip encoding of zero run length */
1895 toggle = !toggle;
1896 continue;
1897 }
1898 DCBP_set_start(p, 0);
1899 }
1900
1901 /* paranoia: catch zero runlength.
1902 * can only happen if bitmap is modified while we scan it. */
1903 if (rl == 0) {
1904 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1905 "t:%u bo:%lu\n", toggle, c->bit_offset);
1906 return -1;
1907 }
1908
1909 bits = vli_encode_bits(&bs, rl);
1910 if (bits == -ENOBUFS) /* buffer full */
1911 break;
1912 if (bits <= 0) {
1913 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1914 return 0;
1915 }
1916
1917 toggle = !toggle;
1918 plain_bits += rl;
1919 c->bit_offset = tmp;
1920 } while (c->bit_offset < c->bm_bits);
1921
1922 len = bs.cur.b - p->code + !!bs.cur.bit;
1923
1924 if (plain_bits < (len << 3)) {
1925 /* incompressible with this method.
1926 * we need to rewind both word and bit position. */
1927 c->bit_offset -= plain_bits;
1928 bm_xfer_ctx_bit_to_word_offset(c);
1929 c->bit_offset = c->word_offset * BITS_PER_LONG;
1930 return 0;
1931 }
1932
1933 /* RLE + VLI was able to compress it just fine.
1934 * update c->word_offset. */
1935 bm_xfer_ctx_bit_to_word_offset(c);
1936
1937 /* store pad_bits */
1938 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1939
1940 return len;
1941}
1942
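/*
 * A minimal standalone sketch (not drbd code) of the pay-off test at the
 * end of fill_bitmap_rle_bits() above: the RLE+VLI code is only used when
 * it covers at least as many bitmap bits as it costs in encoded bits,
 * i.e. plain_bits >= code_len_bytes * 8; otherwise the caller falls back
 * to sending plain bitmap words.
 */
#include <stdio.h>

static int rle_vli_pays_off(unsigned long plain_bits, unsigned int code_len_bytes)
{
	return plain_bits >= ((unsigned long)code_len_bytes << 3);
}

int main(void)
{
	/* 4096 bitmap bits squeezed into 20 bytes (160 bits): worth it */
	printf("%d\n", rle_vli_pays_off(4096, 20));	/* prints 1 */
	/* 100 bitmap bits still needing 20 bytes: cheaper to send them plain */
	printf("%d\n", rle_vli_pays_off(100, 20));	/* prints 0 */
	return 0;
}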
1943enum { OK, FAILED, DONE }
1944send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1945 struct p_header *h, struct bm_xfer_ctx *c)
1946{
1947 struct p_compressed_bm *p = (void*)h;
1948 unsigned long num_words;
1949 int len;
1950 int ok;
1951
1952 len = fill_bitmap_rle_bits(mdev, p, c);
1953
1954 if (len < 0)
1955 return FAILED;
1956
1957 if (len) {
1958 DCBP_set_code(p, RLE_VLI_Bits);
1959 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1960 sizeof(*p) + len, 0);
1961
1962 c->packets[0]++;
1963 c->bytes[0] += sizeof(*p) + len;
1964
1965 if (c->bit_offset >= c->bm_bits)
1966 len = 0; /* DONE */
1967 } else {
1968 /* was not compressible.
1969 * send a buffer full of plain text bits instead. */
1970 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1971 len = num_words * sizeof(long);
1972 if (len)
1973 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1974 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1975 h, sizeof(struct p_header) + len, 0);
1976 c->word_offset += num_words;
1977 c->bit_offset = c->word_offset * BITS_PER_LONG;
1978
1979 c->packets[1]++;
1980 c->bytes[1] += sizeof(struct p_header) + len;
1981
1982 if (c->bit_offset > c->bm_bits)
1983 c->bit_offset = c->bm_bits;
1984 }
1985 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1986
1987 if (ok == DONE)
1988 INFO_bm_xfer_stats(mdev, "send", c);
1989 return ok;
1990}
1991
1992/* See the comment at receive_bitmap() */
1993int _drbd_send_bitmap(struct drbd_conf *mdev)
1994{
1995 struct bm_xfer_ctx c;
1996 struct p_header *p;
1997 int ret;
1998
1999 ERR_IF(!mdev->bitmap) return FALSE;
2000
2001 /* maybe we should use some per thread scratch page,
2002 * and allocate that during initial device creation? */
2003 p = (struct p_header *) __get_free_page(GFP_NOIO);
2004 if (!p) {
2005 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2006 return FALSE;
2007 }
2008
2009 if (get_ldev(mdev)) {
2010 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2011 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2012 drbd_bm_set_all(mdev);
2013 if (drbd_bm_write(mdev)) {
2014 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2015 * but otherwise process as per normal - need to tell other
2016 * side that a full resync is required! */
2017 dev_err(DEV, "Failed to write bitmap to disk!\n");
2018 } else {
2019 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2020 drbd_md_sync(mdev);
2021 }
2022 }
2023 put_ldev(mdev);
2024 }
2025
2026 c = (struct bm_xfer_ctx) {
2027 .bm_bits = drbd_bm_bits(mdev),
2028 .bm_words = drbd_bm_words(mdev),
2029 };
2030
2031 do {
2032 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2033 } while (ret == OK);
2034
2035 free_page((unsigned long) p);
2036 return (ret == DONE);
2037}
2038
2039int drbd_send_bitmap(struct drbd_conf *mdev)
2040{
2041 int err;
2042
2043 if (!drbd_get_data_sock(mdev))
2044 return -1;
2045 err = !_drbd_send_bitmap(mdev);
2046 drbd_put_data_sock(mdev);
2047 return err;
2048}
2049
2050int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2051{
2052 int ok;
2053 struct p_barrier_ack p;
2054
2055 p.barrier = barrier_nr;
2056 p.set_size = cpu_to_be32(set_size);
2057
2058 if (mdev->state.conn < C_CONNECTED)
2059 return FALSE;
2060 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2061 (struct p_header *)&p, sizeof(p));
2062 return ok;
2063}
2064
2065/**
2066 * _drbd_send_ack() - Sends an ack packet
2067 * @mdev: DRBD device.
2068 * @cmd: Packet command code.
2069 * @sector: sector, needs to be in big endian byte order
 2070 * @blksize:	size in bytes, needs to be in big endian byte order
2071 * @block_id: Id, big endian byte order
2072 */
2073static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2074 u64 sector,
2075 u32 blksize,
2076 u64 block_id)
2077{
2078 int ok;
2079 struct p_block_ack p;
2080
2081 p.sector = sector;
2082 p.block_id = block_id;
2083 p.blksize = blksize;
2084 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2085
2086 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2087 return FALSE;
2088 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2089 (struct p_header *)&p, sizeof(p));
2090 return ok;
2091}
2092
2093int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2094 struct p_data *dp)
2095{
2096 const int header_size = sizeof(struct p_data)
2097 - sizeof(struct p_header);
2098 int data_size = ((struct p_header *)dp)->length - header_size;
2099
2100 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2101 dp->block_id);
2102}
2103
2104int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2105 struct p_block_req *rp)
2106{
2107 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2108}
2109
2110/**
2111 * drbd_send_ack() - Sends an ack packet
2112 * @mdev: DRBD device.
2113 * @cmd: Packet command code.
2114 * @e: Epoch entry.
2115 */
2116int drbd_send_ack(struct drbd_conf *mdev,
2117 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2118{
2119 return _drbd_send_ack(mdev, cmd,
2120 cpu_to_be64(e->sector),
2121 cpu_to_be32(e->size),
2122 e->block_id);
2123}
2124
 2125/* This function misuses the block_id field to signal whether the blocks
 2126 * are in sync or not. */
2127int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2128 sector_t sector, int blksize, u64 block_id)
2129{
2130 return _drbd_send_ack(mdev, cmd,
2131 cpu_to_be64(sector),
2132 cpu_to_be32(blksize),
2133 cpu_to_be64(block_id));
2134}
2135
2136int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2137 sector_t sector, int size, u64 block_id)
2138{
2139 int ok;
2140 struct p_block_req p;
2141
2142 p.sector = cpu_to_be64(sector);
2143 p.block_id = block_id;
2144 p.blksize = cpu_to_be32(size);
2145
2146 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2147 (struct p_header *)&p, sizeof(p));
2148 return ok;
2149}
2150
2151int drbd_send_drequest_csum(struct drbd_conf *mdev,
2152 sector_t sector, int size,
2153 void *digest, int digest_size,
2154 enum drbd_packets cmd)
2155{
2156 int ok;
2157 struct p_block_req p;
2158
2159 p.sector = cpu_to_be64(sector);
2160 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2161 p.blksize = cpu_to_be32(size);
2162
2163 p.head.magic = BE_DRBD_MAGIC;
2164 p.head.command = cpu_to_be16(cmd);
2165 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2166
2167 mutex_lock(&mdev->data.mutex);
2168
2169 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2170 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2171
2172 mutex_unlock(&mdev->data.mutex);
2173
2174 return ok;
2175}
2176
2177int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2178{
2179 int ok;
2180 struct p_block_req p;
2181
2182 p.sector = cpu_to_be64(sector);
2183 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2184 p.blksize = cpu_to_be32(size);
2185
2186 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2187 (struct p_header *)&p, sizeof(p));
2188 return ok;
2189}
2190
2191static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
2192{
2193 struct p_delay_probe dp;
2194 int offset, ok = 0;
2195 struct timeval now;
2196
2197 mutex_lock(&ds->mutex);
2198 if (likely(ds->socket)) {
2199 do_gettimeofday(&now);
2200 offset = now.tv_usec - mdev->dps_time.tv_usec +
2201 (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
2202 dp.seq_num = cpu_to_be32(atomic_read(&mdev->delay_seq));
2203 dp.offset = cpu_to_be32(offset);
2204
2205 ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
2206 (struct p_header *)&dp, sizeof(dp), 0);
2207 }
2208 mutex_unlock(&ds->mutex);
2209
2210 mdev->dp_volume_last = mdev->send_cnt;
2211 mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10);
2212
2213 return ok;
2214}
2215
bd26bfc5 2216static int drbd_send_delay_probes(struct drbd_conf *mdev)
2217{
2218 int ok;
2219 atomic_inc(&mdev->delay_seq);
2220 do_gettimeofday(&mdev->dps_time);
2221 ok = drbd_send_delay_probe(mdev, &mdev->meta);
2222 ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
2223
2224 return ok;
2225}
2226
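/*
 * A minimal standalone sketch (not drbd code): the offset that
 * drbd_send_delay_probe() above puts on the wire is simply "microseconds
 * elapsed since dps_time was taken", computed from two struct timeval
 * values exactly as in the driver.
 */
#include <stdio.h>
#include <sys/time.h>

static int usec_since(const struct timeval *then, const struct timeval *now)
{
	return (now->tv_sec - then->tv_sec) * 1000000 +
	       (now->tv_usec - then->tv_usec);
}

int main(void)
{
	struct timeval then = { .tv_sec = 10, .tv_usec = 900000 };
	struct timeval now  = { .tv_sec = 11, .tv_usec = 100000 };

	printf("offset: %d usec\n", usec_since(&then, &now));	/* 200000 */
	return 0;
}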
2227/* called on sndtimeo
2228 * returns FALSE if we should retry,
 2229 * TRUE if we think the connection is dead
2230 */
2231static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2232{
2233 int drop_it;
2234 /* long elapsed = (long)(jiffies - mdev->last_received); */
2235
2236 drop_it = mdev->meta.socket == sock
2237 || !mdev->asender.task
2238 || get_t_state(&mdev->asender) != Running
2239 || mdev->state.conn < C_CONNECTED;
2240
2241 if (drop_it)
2242 return TRUE;
2243
2244 drop_it = !--mdev->ko_count;
2245 if (!drop_it) {
2246 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2247 current->comm, current->pid, mdev->ko_count);
2248 request_ping(mdev);
2249 }
2250
2251 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2252}
2253
2254/* The idea of sendpage seems to be to put some kind of reference
2255 * to the page into the skb, and to hand it over to the NIC. In
2256 * this process get_page() gets called.
2257 *
 2258 * As soon as the page has really been sent over the network, put_page()
2259 * gets called by some part of the network layer. [ NIC driver? ]
2260 *
2261 * [ get_page() / put_page() increment/decrement the count. If count
2262 * reaches 0 the page will be freed. ]
2263 *
2264 * This works nicely with pages from FSs.
2265 * But this means that in protocol A we might signal IO completion too early!
2266 *
2267 * In order not to corrupt data during a resync we must make sure
 2268 * that we do not reuse our own buffer pages (EEs) too early; therefore
2269 * we have the net_ee list.
2270 *
 2271 * XFS still seems to have problems: it submits pages with page_count == 0!
2272 * As a workaround, we disable sendpage on pages
2273 * with page_count == 0 or PageSlab.
2274 */
2275static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2276 int offset, size_t size)
2277{
2278 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2279 kunmap(page);
2280 if (sent == size)
2281 mdev->send_cnt += size>>9;
2282 return sent == size;
2283}
2284
2285static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2286 int offset, size_t size)
2287{
2288 mm_segment_t oldfs = get_fs();
2289 int sent, ok;
2290 int len = size;
2291
2292 /* e.g. XFS meta- & log-data is in slab pages, which have a
2293 * page_count of 0 and/or have PageSlab() set.
2294 * we cannot use send_page for those, as that does get_page();
2295 * put_page(); and would cause either a VM_BUG directly, or
2296 * __page_cache_release a page that would actually still be referenced
2297 * by someone, leading to some obscure delayed Oops somewhere else. */
2298 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2299 return _drbd_no_send_page(mdev, page, offset, size);
2300
2301 drbd_update_congested(mdev);
2302 set_fs(KERNEL_DS);
2303 do {
2304 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2305 offset, len,
2306 MSG_NOSIGNAL);
2307 if (sent == -EAGAIN) {
2308 if (we_should_drop_the_connection(mdev,
2309 mdev->data.socket))
2310 break;
2311 else
2312 continue;
2313 }
2314 if (sent <= 0) {
2315 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2316 __func__, (int)size, len, sent);
2317 break;
2318 }
2319 len -= sent;
2320 offset += sent;
2321 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2322 set_fs(oldfs);
2323 clear_bit(NET_CONGESTED, &mdev->flags);
2324
2325 ok = (len == 0);
2326 if (likely(ok))
2327 mdev->send_cnt += size>>9;
2328 return ok;
2329}
2330
2331static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2332{
2333 struct bio_vec *bvec;
2334 int i;
2335 __bio_for_each_segment(bvec, bio, i, 0) {
2336 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2337 bvec->bv_offset, bvec->bv_len))
2338 return 0;
2339 }
2340 return 1;
2341}
2342
2343static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2344{
2345 struct bio_vec *bvec;
2346 int i;
2347 __bio_for_each_segment(bvec, bio, i, 0) {
2348 if (!_drbd_send_page(mdev, bvec->bv_page,
2349 bvec->bv_offset, bvec->bv_len))
2350 return 0;
2351 }
2352
2353 return 1;
2354}
2355
2356static void consider_delay_probes(struct drbd_conf *mdev)
2357{
2358 if (mdev->state.conn != C_SYNC_SOURCE)
2359 return;
2360
2361 if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt)
2362 drbd_send_delay_probes(mdev);
2363}
2364
2365static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
2366{
2367 if (!cancel && mdev->state.conn == C_SYNC_SOURCE)
2368 drbd_send_delay_probes(mdev);
2369
2370 return 1;
2371}
2372
2373static void delay_probe_timer_fn(unsigned long data)
2374{
2375 struct drbd_conf *mdev = (struct drbd_conf *) data;
2376
2377 drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work);
2378}
2379
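/*
 * A minimal standalone sketch (not drbd code) of the trigger used by
 * consider_delay_probes() above.  send_cnt is maintained in 512-byte
 * sectors (see _drbd_send_page(): send_cnt += size >> 9); dp_volume is a
 * configuration value, presumably in KiB, so the factor 2 converts it to
 * sectors.  A new probe pair goes out once roughly dp_volume worth of data
 * has been sent since the last probe.
 */
#include <stdio.h>

static int should_send_probe(unsigned long sectors_at_last_probe,
			     unsigned long dp_volume,	/* config value; x2 -> sectors */
			     unsigned long sectors_sent)
{
	return sectors_at_last_probe + dp_volume * 2 < sectors_sent;
}

int main(void)
{
	/* last probe at 1000 sectors, dp_volume 16384: next probe only
	 * after more than 32768 further sectors have gone out */
	printf("%d\n", should_send_probe(1000, 16384, 20000));	/* 0 */
	printf("%d\n", should_send_probe(1000, 16384, 40000));	/* 1 */
	return 0;
}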
2380/* Used to send write requests
2381 * R_PRIMARY -> Peer (P_DATA)
2382 */
2383int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2384{
2385 int ok = 1;
2386 struct p_data p;
2387 unsigned int dp_flags = 0;
2388 void *dgb;
2389 int dgs;
2390
2391 if (!drbd_get_data_sock(mdev))
2392 return 0;
2393
2394 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2395 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2396
2397 p.head.magic = BE_DRBD_MAGIC;
2398 p.head.command = cpu_to_be16(P_DATA);
2399 p.head.length =
2400 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2401
2402 p.sector = cpu_to_be64(req->sector);
2403 p.block_id = (unsigned long)req;
2404 p.seq_num = cpu_to_be32(req->seq_num =
2405 atomic_add_return(1, &mdev->packet_seq));
2406 dp_flags = 0;
2407
 2408 	/* NOTE: no need to check if barriers are supported here, as we would
 2409 	 * not pass the test in make_request_common in that case
2410 */
2411 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2412 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2413 /* dp_flags |= DP_HARDBARRIER; */
2414 }
2415 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2416 dp_flags |= DP_RW_SYNC;
2417 /* for now handle SYNCIO and UNPLUG
2418 * as if they still were one and the same flag */
2419 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2420 dp_flags |= DP_RW_SYNC;
2421 if (mdev->state.conn >= C_SYNC_SOURCE &&
2422 mdev->state.conn <= C_PAUSED_SYNC_T)
2423 dp_flags |= DP_MAY_SET_IN_SYNC;
2424
2425 p.dp_flags = cpu_to_be32(dp_flags);
2426 set_bit(UNPLUG_REMOTE, &mdev->flags);
2427 ok = (sizeof(p) ==
2428 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2429 if (ok && dgs) {
2430 dgb = mdev->int_dig_out;
2431 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2432 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2433 }
2434 if (ok) {
2435 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2436 ok = _drbd_send_bio(mdev, req->master_bio);
2437 else
2438 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2439 }
2440
2441 drbd_put_data_sock(mdev);
2442
2443 if (ok)
2444 consider_delay_probes(mdev);
2445
2446 return ok;
2447}
2448
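/*
 * A minimal standalone sketch (not drbd code) of how drbd_send_dblock()
 * above sizes the packet: the length field of the common header counts
 * everything that follows the header, i.e. the remaining p_data fields,
 * an optional integrity digest, and the request payload itself.  The
 * struct sizes used below are placeholders for this sketch only.
 */
#include <stdio.h>

static unsigned int p_data_length(unsigned int sizeof_p_data,
				  unsigned int sizeof_p_header,
				  unsigned int digest_size,
				  unsigned int data_size)
{
	return sizeof_p_data - sizeof_p_header + digest_size + data_size;
}

int main(void)
{
	/* e.g. 32-byte p_data, 8-byte common header, 16-byte digest, 4 KiB bio */
	printf("%u\n", p_data_length(32, 8, 16, 4096));	/* 4136 */
	return 0;
}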
2449/* answer packet, used to send data back for read requests:
2450 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2451 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2452 */
2453int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2454 struct drbd_epoch_entry *e)
2455{
2456 int ok;
2457 struct p_data p;
2458 void *dgb;
2459 int dgs;
2460
2461 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2462 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2463
2464 p.head.magic = BE_DRBD_MAGIC;
2465 p.head.command = cpu_to_be16(cmd);
2466 p.head.length =
2467 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2468
2469 p.sector = cpu_to_be64(e->sector);
2470 p.block_id = e->block_id;
2471 /* p.seq_num = 0; No sequence numbers here.. */
2472
2473 /* Only called by our kernel thread.
2474 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2475 * in response to admin command or module unload.
2476 */
2477 if (!drbd_get_data_sock(mdev))
2478 return 0;
2479
2480 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2481 sizeof(p), MSG_MORE);
2482 if (ok && dgs) {
2483 dgb = mdev->int_dig_out;
2484 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2485 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2486 }
2487 if (ok)
2488 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2489
2490 drbd_put_data_sock(mdev);
2491
2492 if (ok)
2493 consider_delay_probes(mdev);
2494
2495 return ok;
2496}
2497
2498/*
2499 drbd_send distinguishes two cases:
2500
2501 Packets sent via the data socket "sock"
2502 and packets sent via the meta data socket "msock"
2503
 2504                     sock                      msock
 2505  -----------------+-------------------------+------------------------------
 2506  timeout           conf.timeout / 2          conf.timeout / 2
 2507  timeout action    send a ping via msock     Abort communication
 2508                                              and close all sockets
2509*/
2510
2511/*
2512 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2513 */
2514int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2515 void *buf, size_t size, unsigned msg_flags)
2516{
2517 struct kvec iov;
2518 struct msghdr msg;
2519 int rv, sent = 0;
2520
2521 if (!sock)
2522 return -1000;
2523
2524 /* THINK if (signal_pending) return ... ? */
2525
2526 iov.iov_base = buf;
2527 iov.iov_len = size;
2528
2529 msg.msg_name = NULL;
2530 msg.msg_namelen = 0;
2531 msg.msg_control = NULL;
2532 msg.msg_controllen = 0;
2533 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2534
2535 if (sock == mdev->data.socket) {
2536 mdev->ko_count = mdev->net_conf->ko_count;
2537 drbd_update_congested(mdev);
2538 }
2539 do {
2540 /* STRANGE
2541 * tcp_sendmsg does _not_ use its size parameter at all ?
2542 *
2543 * -EAGAIN on timeout, -EINTR on signal.
2544 */
2545/* THINK
2546 * do we need to block DRBD_SIG if sock == &meta.socket ??
2547 * otherwise wake_asender() might interrupt some send_*Ack !
2548 */
2549 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2550 if (rv == -EAGAIN) {
2551 if (we_should_drop_the_connection(mdev, sock))
2552 break;
2553 else
2554 continue;
2555 }
2556 D_ASSERT(rv != 0);
2557 if (rv == -EINTR) {
2558 flush_signals(current);
2559 rv = 0;
2560 }
2561 if (rv < 0)
2562 break;
2563 sent += rv;
2564 iov.iov_base += rv;
2565 iov.iov_len -= rv;
2566 } while (sent < size);
2567
2568 if (sock == mdev->data.socket)
2569 clear_bit(NET_CONGESTED, &mdev->flags);
2570
2571 if (rv <= 0) {
2572 if (rv != -EAGAIN) {
2573 dev_err(DEV, "%s_sendmsg returned %d\n",
2574 sock == mdev->meta.socket ? "msock" : "sock",
2575 rv);
2576 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2577 } else
2578 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2579 }
2580
2581 return sent;
2582}
2583
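/*
 * A minimal standalone sketch (not drbd code) of the send loop used by
 * drbd_send() above: keep sending until the whole buffer is out, restart
 * after EINTR, and advance past partial sends.  drbd_send() does the same
 * with kernel_sendmsg() and, on -EAGAIN, additionally asks
 * we_should_drop_the_connection() whether to give up.
 */
#include <errno.h>
#include <stddef.h>
#include <sys/socket.h>
#include <sys/types.h>

static ssize_t send_all(int fd, const void *buf, size_t size)
{
	const char *p = buf;
	size_t sent = 0;

	while (sent < size) {
		ssize_t rv = send(fd, p + sent, size - sent, MSG_NOSIGNAL);

		if (rv < 0) {
			if (errno == EINTR)
				continue;	/* interrupted by a signal: just retry */
			return -1;		/* EAGAIN included: the caller decides,
						 * much like we_should_drop_the_connection() */
		}
		sent += rv;
	}
	return (ssize_t)sent;
}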
2584static int drbd_open(struct block_device *bdev, fmode_t mode)
2585{
2586 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2587 unsigned long flags;
2588 int rv = 0;
2589
2590 spin_lock_irqsave(&mdev->req_lock, flags);
2591 /* to have a stable mdev->state.role
2592 * and no race with updating open_cnt */
2593
2594 if (mdev->state.role != R_PRIMARY) {
2595 if (mode & FMODE_WRITE)
2596 rv = -EROFS;
2597 else if (!allow_oos)
2598 rv = -EMEDIUMTYPE;
2599 }
2600
2601 if (!rv)
2602 mdev->open_cnt++;
2603 spin_unlock_irqrestore(&mdev->req_lock, flags);
2604
2605 return rv;
2606}
2607
2608static int drbd_release(struct gendisk *gd, fmode_t mode)
2609{
2610 struct drbd_conf *mdev = gd->private_data;
2611 mdev->open_cnt--;
2612 return 0;
2613}
2614
2615static void drbd_unplug_fn(struct request_queue *q)
2616{
2617 struct drbd_conf *mdev = q->queuedata;
2618
2619 /* unplug FIRST */
2620 spin_lock_irq(q->queue_lock);
2621 blk_remove_plug(q);
2622 spin_unlock_irq(q->queue_lock);
2623
2624 /* only if connected */
2625 spin_lock_irq(&mdev->req_lock);
2626 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2627 D_ASSERT(mdev->state.role == R_PRIMARY);
2628 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2629 /* add to the data.work queue,
2630 * unless already queued.
2631 * XXX this might be a good addition to drbd_queue_work
2632 * anyways, to detect "double queuing" ... */
2633 if (list_empty(&mdev->unplug_work.list))
2634 drbd_queue_work(&mdev->data.work,
2635 &mdev->unplug_work);
2636 }
2637 }
2638 spin_unlock_irq(&mdev->req_lock);
2639
2640 if (mdev->state.disk >= D_INCONSISTENT)
2641 drbd_kick_lo(mdev);
2642}
2643
2644static void drbd_set_defaults(struct drbd_conf *mdev)
2645{
2646 mdev->sync_conf.after = DRBD_AFTER_DEF;
2647 mdev->sync_conf.rate = DRBD_RATE_DEF;
2648 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2649 mdev->state = (union drbd_state) {
2650 { .role = R_SECONDARY,
2651 .peer = R_UNKNOWN,
2652 .conn = C_STANDALONE,
2653 .disk = D_DISKLESS,
2654 .pdsk = D_UNKNOWN,
2655 .susp = 0
2656 } };
2657}
2658
2659void drbd_init_set_defaults(struct drbd_conf *mdev)
2660{
2661 /* the memset(,0,) did most of this.
2662 * note: only assignments, no allocation in here */
2663
2664 drbd_set_defaults(mdev);
2665
2666 /* for now, we do NOT yet support it,
2667 * even though we start some framework
2668 * to eventually support barriers */
2669 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2670
2671 atomic_set(&mdev->ap_bio_cnt, 0);
2672 atomic_set(&mdev->ap_pending_cnt, 0);
2673 atomic_set(&mdev->rs_pending_cnt, 0);
2674 atomic_set(&mdev->unacked_cnt, 0);
2675 atomic_set(&mdev->local_cnt, 0);
2676 atomic_set(&mdev->net_cnt, 0);
2677 atomic_set(&mdev->packet_seq, 0);
2678 atomic_set(&mdev->pp_in_use, 0);
0ced55a3 2679 atomic_set(&mdev->delay_seq, 0);
2680
2681 mutex_init(&mdev->md_io_mutex);
2682 mutex_init(&mdev->data.mutex);
2683 mutex_init(&mdev->meta.mutex);
2684 sema_init(&mdev->data.work.s, 0);
2685 sema_init(&mdev->meta.work.s, 0);
2686 mutex_init(&mdev->state_mutex);
2687
2688 spin_lock_init(&mdev->data.work.q_lock);
2689 spin_lock_init(&mdev->meta.work.q_lock);
2690
2691 spin_lock_init(&mdev->al_lock);
2692 spin_lock_init(&mdev->req_lock);
2693 spin_lock_init(&mdev->peer_seq_lock);
2694 spin_lock_init(&mdev->epoch_lock);
2695
2696 INIT_LIST_HEAD(&mdev->active_ee);
2697 INIT_LIST_HEAD(&mdev->sync_ee);
2698 INIT_LIST_HEAD(&mdev->done_ee);
2699 INIT_LIST_HEAD(&mdev->read_ee);
2700 INIT_LIST_HEAD(&mdev->net_ee);
2701 INIT_LIST_HEAD(&mdev->resync_reads);
2702 INIT_LIST_HEAD(&mdev->data.work.q);
2703 INIT_LIST_HEAD(&mdev->meta.work.q);
2704 INIT_LIST_HEAD(&mdev->resync_work.list);
2705 INIT_LIST_HEAD(&mdev->unplug_work.list);
2706 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2707 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 2708 INIT_LIST_HEAD(&mdev->delay_probes);
bd26bfc5 2709 INIT_LIST_HEAD(&mdev->delay_probe_work.list);
0ced55a3 2710
2711 mdev->resync_work.cb = w_resync_inactive;
2712 mdev->unplug_work.cb = w_send_write_hint;
2713 mdev->md_sync_work.cb = w_md_sync;
2714 mdev->bm_io_work.w.cb = w_bitmap_io;
bd26bfc5 2715 mdev->delay_probe_work.cb = w_delay_probes;
2716 init_timer(&mdev->resync_timer);
2717 init_timer(&mdev->md_sync_timer);
bd26bfc5 2718 init_timer(&mdev->delay_probe_timer);
2719 mdev->resync_timer.function = resync_timer_fn;
2720 mdev->resync_timer.data = (unsigned long) mdev;
2721 mdev->md_sync_timer.function = md_sync_timer_fn;
2722 mdev->md_sync_timer.data = (unsigned long) mdev;
2723 mdev->delay_probe_timer.function = delay_probe_timer_fn;
2724 mdev->delay_probe_timer.data = (unsigned long) mdev;
2725
2726
2727 init_waitqueue_head(&mdev->misc_wait);
2728 init_waitqueue_head(&mdev->state_wait);
2729 init_waitqueue_head(&mdev->ee_wait);
2730 init_waitqueue_head(&mdev->al_wait);
2731 init_waitqueue_head(&mdev->seq_wait);
2732
2733 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2734 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2735 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2736
2737 mdev->agreed_pro_version = PRO_VERSION_MAX;
2738 mdev->write_ordering = WO_bio_barrier;
2739 mdev->resync_wenr = LC_FREE;
2740}
2741
2742void drbd_mdev_cleanup(struct drbd_conf *mdev)
2743{
2744 if (mdev->receiver.t_state != None)
2745 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2746 mdev->receiver.t_state);
2747
2748 /* no need to lock it, I'm the only thread alive */
2749 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2750 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2751 mdev->al_writ_cnt =
2752 mdev->bm_writ_cnt =
2753 mdev->read_cnt =
2754 mdev->recv_cnt =
2755 mdev->send_cnt =
2756 mdev->writ_cnt =
2757 mdev->p_size =
2758 mdev->rs_start =
2759 mdev->rs_total =
2760 mdev->rs_failed =
2761 mdev->rs_mark_left =
2762 mdev->rs_mark_time = 0;
2763 D_ASSERT(mdev->net_conf == NULL);
2764
2765 drbd_set_my_capacity(mdev, 0);
2766 if (mdev->bitmap) {
2767 /* maybe never allocated. */
02d9a94b 2768 drbd_bm_resize(mdev, 0, 1);
2769 drbd_bm_cleanup(mdev);
2770 }
2771
2772 drbd_free_resources(mdev);
2773
2774 /*
 2775 	 * currently we call drbd_init_ee only on module load, so
 2776 	 * we may call drbd_release_ee only on module unload!
2777 */
2778 D_ASSERT(list_empty(&mdev->active_ee));
2779 D_ASSERT(list_empty(&mdev->sync_ee));
2780 D_ASSERT(list_empty(&mdev->done_ee));
2781 D_ASSERT(list_empty(&mdev->read_ee));
2782 D_ASSERT(list_empty(&mdev->net_ee));
2783 D_ASSERT(list_empty(&mdev->resync_reads));
2784 D_ASSERT(list_empty(&mdev->data.work.q));
2785 D_ASSERT(list_empty(&mdev->meta.work.q));
2786 D_ASSERT(list_empty(&mdev->resync_work.list));
2787 D_ASSERT(list_empty(&mdev->unplug_work.list));
2788
2789}
2790
2791
2792static void drbd_destroy_mempools(void)
2793{
2794 struct page *page;
2795
2796 while (drbd_pp_pool) {
2797 page = drbd_pp_pool;
2798 drbd_pp_pool = (struct page *)page_private(page);
2799 __free_page(page);
2800 drbd_pp_vacant--;
2801 }
2802
2803 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2804
2805 if (drbd_ee_mempool)
2806 mempool_destroy(drbd_ee_mempool);
2807 if (drbd_request_mempool)
2808 mempool_destroy(drbd_request_mempool);
2809 if (drbd_ee_cache)
2810 kmem_cache_destroy(drbd_ee_cache);
2811 if (drbd_request_cache)
2812 kmem_cache_destroy(drbd_request_cache);
2813 if (drbd_bm_ext_cache)
2814 kmem_cache_destroy(drbd_bm_ext_cache);
2815 if (drbd_al_ext_cache)
2816 kmem_cache_destroy(drbd_al_ext_cache);
2817
2818 drbd_ee_mempool = NULL;
2819 drbd_request_mempool = NULL;
2820 drbd_ee_cache = NULL;
2821 drbd_request_cache = NULL;
2822 drbd_bm_ext_cache = NULL;
2823 drbd_al_ext_cache = NULL;
2824
2825 return;
2826}
2827
2828static int drbd_create_mempools(void)
2829{
2830 struct page *page;
2831 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2832 int i;
2833
2834 /* prepare our caches and mempools */
2835 drbd_request_mempool = NULL;
2836 drbd_ee_cache = NULL;
2837 drbd_request_cache = NULL;
2838 drbd_bm_ext_cache = NULL;
2839 drbd_al_ext_cache = NULL;
2840 drbd_pp_pool = NULL;
2841
2842 /* caches */
2843 drbd_request_cache = kmem_cache_create(
2844 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2845 if (drbd_request_cache == NULL)
2846 goto Enomem;
2847
2848 drbd_ee_cache = kmem_cache_create(
2849 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2850 if (drbd_ee_cache == NULL)
2851 goto Enomem;
2852
2853 drbd_bm_ext_cache = kmem_cache_create(
2854 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2855 if (drbd_bm_ext_cache == NULL)
2856 goto Enomem;
2857
2858 drbd_al_ext_cache = kmem_cache_create(
2859 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2860 if (drbd_al_ext_cache == NULL)
2861 goto Enomem;
2862
2863 /* mempools */
2864 drbd_request_mempool = mempool_create(number,
2865 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2866 if (drbd_request_mempool == NULL)
2867 goto Enomem;
2868
2869 drbd_ee_mempool = mempool_create(number,
2870 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
 2871 	if (drbd_ee_mempool == NULL)
2872 goto Enomem;
2873
2874 /* drbd's page pool */
2875 spin_lock_init(&drbd_pp_lock);
2876
2877 for (i = 0; i < number; i++) {
2878 page = alloc_page(GFP_HIGHUSER);
2879 if (!page)
2880 goto Enomem;
2881 set_page_private(page, (unsigned long)drbd_pp_pool);
2882 drbd_pp_pool = page;
2883 }
2884 drbd_pp_vacant = number;
2885
2886 return 0;
2887
2888Enomem:
2889 drbd_destroy_mempools(); /* in case we allocated some */
2890 return -ENOMEM;
2891}
2892
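/*
 * A minimal standalone sketch (not drbd code): drbd's page pool above is a
 * plain LIFO list of pages, with the "next" pointer stashed in each page's
 * private field via set_page_private()/page_private().  The same idea with
 * ordinary heap blocks:
 */
#include <stdlib.h>

struct pool_node {
	struct pool_node *next;		/* plays the role of page_private */
	char payload[4096];
};

static struct pool_node *pool;

static int pool_fill(int count)
{
	while (count--) {
		struct pool_node *n = malloc(sizeof(*n));

		if (!n)
			return -1;
		n->next = pool;		/* push: like set_page_private(page, pool) */
		pool = n;
	}
	return 0;
}

static void pool_drain(void)
{
	while (pool) {
		struct pool_node *n = pool;

		pool = n->next;		/* pop: like page_private(page) */
		free(n);
	}
}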
2893static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2894 void *unused)
2895{
2896 /* just so we have it. you never know what interesting things we
2897 * might want to do here some day...
2898 */
2899
2900 return NOTIFY_DONE;
2901}
2902
2903static struct notifier_block drbd_notifier = {
2904 .notifier_call = drbd_notify_sys,
2905};
2906
2907static void drbd_release_ee_lists(struct drbd_conf *mdev)
2908{
2909 int rr;
2910
2911 rr = drbd_release_ee(mdev, &mdev->active_ee);
2912 if (rr)
2913 dev_err(DEV, "%d EEs in active list found!\n", rr);
2914
2915 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2916 if (rr)
2917 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2918
2919 rr = drbd_release_ee(mdev, &mdev->read_ee);
2920 if (rr)
2921 dev_err(DEV, "%d EEs in read list found!\n", rr);
2922
2923 rr = drbd_release_ee(mdev, &mdev->done_ee);
2924 if (rr)
2925 dev_err(DEV, "%d EEs in done list found!\n", rr);
2926
2927 rr = drbd_release_ee(mdev, &mdev->net_ee);
2928 if (rr)
2929 dev_err(DEV, "%d EEs in net list found!\n", rr);
2930}
2931
2932/* caution. no locking.
2933 * currently only used from module cleanup code. */
2934static void drbd_delete_device(unsigned int minor)
2935{
2936 struct drbd_conf *mdev = minor_to_mdev(minor);
2937
2938 if (!mdev)
2939 return;
2940
2941 /* paranoia asserts */
2942 if (mdev->open_cnt != 0)
2943 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2944 __FILE__ , __LINE__);
2945
2946 ERR_IF (!list_empty(&mdev->data.work.q)) {
2947 struct list_head *lp;
2948 list_for_each(lp, &mdev->data.work.q) {
2949 dev_err(DEV, "lp = %p\n", lp);
2950 }
2951 };
2952 /* end paranoia asserts */
2953
2954 del_gendisk(mdev->vdisk);
2955
2956 /* cleanup stuff that may have been allocated during
2957 * device (re-)configuration or state changes */
2958
2959 if (mdev->this_bdev)
2960 bdput(mdev->this_bdev);
2961
2962 drbd_free_resources(mdev);
2963
2964 drbd_release_ee_lists(mdev);
2965
2966 /* should be free'd on disconnect? */
2967 kfree(mdev->ee_hash);
2968 /*
2969 mdev->ee_hash_s = 0;
2970 mdev->ee_hash = NULL;
2971 */
2972
2973 lc_destroy(mdev->act_log);
2974 lc_destroy(mdev->resync);
2975
2976 kfree(mdev->p_uuid);
2977 /* mdev->p_uuid = NULL; */
2978
2979 kfree(mdev->int_dig_out);
2980 kfree(mdev->int_dig_in);
2981 kfree(mdev->int_dig_vv);
2982
2983 /* cleanup the rest that has been
2984 * allocated from drbd_new_device
2985 * and actually free the mdev itself */
2986 drbd_free_mdev(mdev);
2987}
2988
2989static void drbd_cleanup(void)
2990{
2991 unsigned int i;
2992
2993 unregister_reboot_notifier(&drbd_notifier);
2994
2995 drbd_nl_cleanup();
2996
2997 if (minor_table) {
2998 if (drbd_proc)
2999 remove_proc_entry("drbd", NULL);
3000 i = minor_count;
3001 while (i--)
3002 drbd_delete_device(i);
3003 drbd_destroy_mempools();
3004 }
3005
3006 kfree(minor_table);
3007
3008 unregister_blkdev(DRBD_MAJOR, "drbd");
3009
3010 printk(KERN_INFO "drbd: module cleanup done.\n");
3011}
3012
3013/**
3014 * drbd_congested() - Callback for pdflush
3015 * @congested_data: User data
3016 * @bdi_bits: Bits pdflush is currently interested in
3017 *
3018 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3019 */
3020static int drbd_congested(void *congested_data, int bdi_bits)
3021{
3022 struct drbd_conf *mdev = congested_data;
3023 struct request_queue *q;
3024 char reason = '-';
3025 int r = 0;
3026
3027 if (!__inc_ap_bio_cond(mdev)) {
3028 /* DRBD has frozen IO */
3029 r = bdi_bits;
3030 reason = 'd';
3031 goto out;
3032 }
3033
3034 if (get_ldev(mdev)) {
3035 q = bdev_get_queue(mdev->ldev->backing_bdev);
3036 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3037 put_ldev(mdev);
3038 if (r)
3039 reason = 'b';
3040 }
3041
3042 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3043 r |= (1 << BDI_async_congested);
3044 reason = reason == 'b' ? 'a' : 'n';
3045 }
3046
3047out:
3048 mdev->congestion_reason = reason;
3049 return r;
3050}
3051
3052struct drbd_conf *drbd_new_device(unsigned int minor)
3053{
3054 struct drbd_conf *mdev;
3055 struct gendisk *disk;
3056 struct request_queue *q;
3057
3058 /* GFP_KERNEL, we are outside of all write-out paths */
3059 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3060 if (!mdev)
3061 return NULL;
3062 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3063 goto out_no_cpumask;
3064
3065 mdev->minor = minor;
3066
3067 drbd_init_set_defaults(mdev);
3068
3069 q = blk_alloc_queue(GFP_KERNEL);
3070 if (!q)
3071 goto out_no_q;
3072 mdev->rq_queue = q;
3073 q->queuedata = mdev;
3074
3075 disk = alloc_disk(1);
3076 if (!disk)
3077 goto out_no_disk;
3078 mdev->vdisk = disk;
3079
3080 set_disk_ro(disk, TRUE);
3081
3082 disk->queue = q;
3083 disk->major = DRBD_MAJOR;
3084 disk->first_minor = minor;
3085 disk->fops = &drbd_ops;
3086 sprintf(disk->disk_name, "drbd%d", minor);
3087 disk->private_data = mdev;
3088
3089 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3090 /* we have no partitions. we contain only ourselves. */
3091 mdev->this_bdev->bd_contains = mdev->this_bdev;
3092
3093 q->backing_dev_info.congested_fn = drbd_congested;
3094 q->backing_dev_info.congested_data = mdev;
3095
3096 blk_queue_make_request(q, drbd_make_request_26);
98ec286e 3097 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
3098 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3099 blk_queue_merge_bvec(q, drbd_merge_bvec);
3100 q->queue_lock = &mdev->req_lock; /* needed since we use */
 3101 	/* plugging on a queue that actually has no requests! */
3102 q->unplug_fn = drbd_unplug_fn;
3103
3104 mdev->md_io_page = alloc_page(GFP_KERNEL);
3105 if (!mdev->md_io_page)
3106 goto out_no_io_page;
3107
3108 if (drbd_bm_init(mdev))
3109 goto out_no_bitmap;
3110 /* no need to lock access, we are still initializing this minor device. */
3111 if (!tl_init(mdev))
3112 goto out_no_tl;
3113
3114 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3115 if (!mdev->app_reads_hash)
3116 goto out_no_app_reads;
3117
3118 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3119 if (!mdev->current_epoch)
3120 goto out_no_epoch;
3121
3122 INIT_LIST_HEAD(&mdev->current_epoch->list);
3123 mdev->epochs = 1;
3124
3125 return mdev;
3126
3127/* out_whatever_else:
3128 kfree(mdev->current_epoch); */
3129out_no_epoch:
3130 kfree(mdev->app_reads_hash);
3131out_no_app_reads:
3132 tl_cleanup(mdev);
3133out_no_tl:
3134 drbd_bm_cleanup(mdev);
3135out_no_bitmap:
3136 __free_page(mdev->md_io_page);
3137out_no_io_page:
3138 put_disk(disk);
3139out_no_disk:
3140 blk_cleanup_queue(q);
3141out_no_q:
3142 free_cpumask_var(mdev->cpu_mask);
3143out_no_cpumask:
3144 kfree(mdev);
3145 return NULL;
3146}
3147
3148/* counterpart of drbd_new_device.
3149 * last part of drbd_delete_device. */
3150void drbd_free_mdev(struct drbd_conf *mdev)
3151{
3152 kfree(mdev->current_epoch);
3153 kfree(mdev->app_reads_hash);
3154 tl_cleanup(mdev);
3155 if (mdev->bitmap) /* should no longer be there. */
3156 drbd_bm_cleanup(mdev);
3157 __free_page(mdev->md_io_page);
3158 put_disk(mdev->vdisk);
3159 blk_cleanup_queue(mdev->rq_queue);
3160 free_cpumask_var(mdev->cpu_mask);
3161 kfree(mdev);
3162}
3163
3164
3165int __init drbd_init(void)
3166{
3167 int err;
3168
3169 if (sizeof(struct p_handshake) != 80) {
3170 printk(KERN_ERR
3171 "drbd: never change the size or layout "
3172 "of the HandShake packet.\n");
3173 return -EINVAL;
3174 }
3175
3176 if (1 > minor_count || minor_count > 255) {
3177 printk(KERN_ERR
3178 "drbd: invalid minor_count (%d)\n", minor_count);
3179#ifdef MODULE
3180 return -EINVAL;
3181#else
3182 minor_count = 8;
3183#endif
3184 }
3185
3186 err = drbd_nl_init();
3187 if (err)
3188 return err;
3189
3190 err = register_blkdev(DRBD_MAJOR, "drbd");
3191 if (err) {
3192 printk(KERN_ERR
3193 "drbd: unable to register block device major %d\n",
3194 DRBD_MAJOR);
3195 return err;
3196 }
3197
3198 register_reboot_notifier(&drbd_notifier);
3199
3200 /*
3201 * allocate all necessary structs
3202 */
3203 err = -ENOMEM;
3204
3205 init_waitqueue_head(&drbd_pp_wait);
3206
3207 drbd_proc = NULL; /* play safe for drbd_cleanup */
3208 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3209 GFP_KERNEL);
3210 if (!minor_table)
3211 goto Enomem;
3212
3213 err = drbd_create_mempools();
3214 if (err)
3215 goto Enomem;
3216
8c484ee4 3217 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3218 if (!drbd_proc) {
3219 printk(KERN_ERR "drbd: unable to register proc file\n");
3220 goto Enomem;
3221 }
3222
3223 rwlock_init(&global_state_lock);
3224
3225 printk(KERN_INFO "drbd: initialized. "
3226 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3227 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3228 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3229 printk(KERN_INFO "drbd: registered as block device major %d\n",
3230 DRBD_MAJOR);
3231 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3232
3233 return 0; /* Success! */
3234
3235Enomem:
3236 drbd_cleanup();
3237 if (err == -ENOMEM)
3238 /* currently always the case */
3239 printk(KERN_ERR "drbd: ran out of memory\n");
3240 else
3241 printk(KERN_ERR "drbd: initialization failure\n");
3242 return err;
3243}
3244
3245void drbd_free_bc(struct drbd_backing_dev *ldev)
3246{
3247 if (ldev == NULL)
3248 return;
3249
3250 bd_release(ldev->backing_bdev);
3251 bd_release(ldev->md_bdev);
3252
3253 fput(ldev->lo_file);
3254 fput(ldev->md_file);
3255
3256 kfree(ldev);
3257}
3258
3259void drbd_free_sock(struct drbd_conf *mdev)
3260{
3261 if (mdev->data.socket) {
4589d7f8 3262 mutex_lock(&mdev->data.mutex);
3263 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3264 sock_release(mdev->data.socket);
3265 mdev->data.socket = NULL;
4589d7f8 3266 mutex_unlock(&mdev->data.mutex);
3267 }
3268 if (mdev->meta.socket) {
4589d7f8 3269 mutex_lock(&mdev->meta.mutex);
3270 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3271 sock_release(mdev->meta.socket);
3272 mdev->meta.socket = NULL;
4589d7f8 3273 mutex_unlock(&mdev->meta.mutex);
3274 }
3275}
3276
3277
3278void drbd_free_resources(struct drbd_conf *mdev)
3279{
3280 crypto_free_hash(mdev->csums_tfm);
3281 mdev->csums_tfm = NULL;
3282 crypto_free_hash(mdev->verify_tfm);
3283 mdev->verify_tfm = NULL;
3284 crypto_free_hash(mdev->cram_hmac_tfm);
3285 mdev->cram_hmac_tfm = NULL;
3286 crypto_free_hash(mdev->integrity_w_tfm);
3287 mdev->integrity_w_tfm = NULL;
3288 crypto_free_hash(mdev->integrity_r_tfm);
3289 mdev->integrity_r_tfm = NULL;
3290
3291 drbd_free_sock(mdev);
3292
3293 __no_warn(local,
3294 drbd_free_bc(mdev->ldev);
3295 mdev->ldev = NULL;);
3296}
3297
3298/* meta data management */
3299
3300struct meta_data_on_disk {
3301 u64 la_size; /* last agreed size. */
3302 u64 uuid[UI_SIZE]; /* UUIDs. */
3303 u64 device_uuid;
3304 u64 reserved_u64_1;
3305 u32 flags; /* MDF */
3306 u32 magic;
3307 u32 md_size_sect;
3308 u32 al_offset; /* offset to this block */
3309 u32 al_nr_extents; /* important for restoring the AL */
3310 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3311 u32 bm_offset; /* offset to the bitmap, from here */
3312 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3313 u32 reserved_u32[4];
3314
3315} __packed;
3316
3317/**
3318 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3319 * @mdev: DRBD device.
3320 */
3321void drbd_md_sync(struct drbd_conf *mdev)
3322{
3323 struct meta_data_on_disk *buffer;
3324 sector_t sector;
3325 int i;
3326
3327 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3328 return;
3329 del_timer(&mdev->md_sync_timer);
3330
 3331 	/* We use D_FAILED here and not D_ATTACHING because we try to write
3332 * metadata even if we detach due to a disk failure! */
3333 if (!get_ldev_if_state(mdev, D_FAILED))
3334 return;
3335
3336 mutex_lock(&mdev->md_io_mutex);
3337 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3338 memset(buffer, 0, 512);
3339
3340 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3341 for (i = UI_CURRENT; i < UI_SIZE; i++)
3342 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3343 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3344 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3345
3346 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3347 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3348 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3349 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3350 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3351
3352 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3353
3354 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3355 sector = mdev->ldev->md.md_offset;
3356
3357 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3358 clear_bit(MD_DIRTY, &mdev->flags);
3359 } else {
3360 /* this was a try anyways ... */
3361 dev_err(DEV, "meta data update failed!\n");
3362
3363 drbd_chk_io_error(mdev, 1, TRUE);
3364 }
3365
3366 /* Update mdev->ldev->md.la_size_sect,
3367 * since we updated it on metadata. */
3368 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3369
3370 mutex_unlock(&mdev->md_io_mutex);
3371 put_ldev(mdev);
3372}
3373
3374/**
3375 * drbd_md_read() - Reads in the meta data super block
3376 * @mdev: DRBD device.
3377 * @bdev: Device from which the meta data should be read in.
3378 *
3379 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3380 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3381 */
3382int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3383{
3384 struct meta_data_on_disk *buffer;
3385 int i, rv = NO_ERROR;
3386
3387 if (!get_ldev_if_state(mdev, D_ATTACHING))
3388 return ERR_IO_MD_DISK;
3389
3390 mutex_lock(&mdev->md_io_mutex);
3391 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3392
3393 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
 3394 		/* NOTE: can't do normal error processing here as this is
3395 called BEFORE disk is attached */
3396 dev_err(DEV, "Error while reading metadata.\n");
3397 rv = ERR_IO_MD_DISK;
3398 goto err;
3399 }
3400
3401 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3402 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3403 rv = ERR_MD_INVALID;
3404 goto err;
3405 }
3406 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3407 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3408 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3409 rv = ERR_MD_INVALID;
3410 goto err;
3411 }
3412 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3413 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3414 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3415 rv = ERR_MD_INVALID;
3416 goto err;
3417 }
3418 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3419 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3420 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3421 rv = ERR_MD_INVALID;
3422 goto err;
3423 }
3424
3425 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3426 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3427 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3428 rv = ERR_MD_INVALID;
3429 goto err;
3430 }
3431
3432 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3433 for (i = UI_CURRENT; i < UI_SIZE; i++)
3434 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3435 bdev->md.flags = be32_to_cpu(buffer->flags);
3436 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3437 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3438
3439 if (mdev->sync_conf.al_extents < 7)
3440 mdev->sync_conf.al_extents = 127;
3441
3442 err:
3443 mutex_unlock(&mdev->md_io_mutex);
3444 put_ldev(mdev);
3445
3446 return rv;
3447}
3448
3449/**
3450 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3451 * @mdev: DRBD device.
3452 *
3453 * Call this function if you change anything that should be written to
3454 * the meta-data super block. This function sets MD_DIRTY, and starts a
 3455 * timer that ensures that drbd_md_sync() gets called within five seconds.
3456 */
3457void drbd_md_mark_dirty(struct drbd_conf *mdev)
3458{
3459 set_bit(MD_DIRTY, &mdev->flags);
3460 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3461}
3462
3463
3464static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3465{
3466 int i;
3467
6a0afdf5 3468 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 3469 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3470}
3471
3472void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3473{
3474 if (idx == UI_CURRENT) {
3475 if (mdev->state.role == R_PRIMARY)
3476 val |= 1;
3477 else
3478 val &= ~((u64)1);
3479
3480 drbd_set_ed_uuid(mdev, val);
3481 }
3482
3483 mdev->ldev->md.uuid[idx] = val;
3484 drbd_md_mark_dirty(mdev);
3485}
3486
3487
3488void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3489{
3490 if (mdev->ldev->md.uuid[idx]) {
3491 drbd_uuid_move_history(mdev);
3492 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3493 }
3494 _drbd_uuid_set(mdev, idx, val);
3495}
3496
3497/**
3498 * drbd_uuid_new_current() - Creates a new current UUID
3499 * @mdev: DRBD device.
3500 *
3501 * Creates a new current UUID, and rotates the old current UUID into
3502 * the bitmap slot. Causes an incremental resync upon next connect.
3503 */
3504void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3505{
3506 u64 val;
3507
3508 dev_info(DEV, "Creating new current UUID\n");
3509 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3510 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3511
3512 get_random_bytes(&val, sizeof(u64));
3513 _drbd_uuid_set(mdev, UI_CURRENT, val);
3514}
3515
3516void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3517{
3518 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3519 return;
3520
3521 if (val == 0) {
3522 drbd_uuid_move_history(mdev);
3523 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3524 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3525 } else {
3526 if (mdev->ldev->md.uuid[UI_BITMAP])
3527 dev_warn(DEV, "bm UUID already set");
3528
3529 mdev->ldev->md.uuid[UI_BITMAP] = val;
3530 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3531
3532 }
3533 drbd_md_mark_dirty(mdev);
3534}
3535
3536/**
3537 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3538 * @mdev: DRBD device.
3539 *
3540 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3541 */
3542int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3543{
3544 int rv = -EIO;
3545
3546 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3547 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3548 drbd_md_sync(mdev);
3549 drbd_bm_set_all(mdev);
3550
3551 rv = drbd_bm_write(mdev);
3552
3553 if (!rv) {
3554 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3555 drbd_md_sync(mdev);
3556 }
3557
3558 put_ldev(mdev);
3559 }
3560
3561 return rv;
3562}
3563
3564/**
3565 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3566 * @mdev: DRBD device.
3567 *
3568 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3569 */
3570int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3571{
3572 int rv = -EIO;
3573
3574 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3575 drbd_bm_clear_all(mdev);
3576 rv = drbd_bm_write(mdev);
3577 put_ldev(mdev);
3578 }
3579
3580 return rv;
3581}
3582
3583static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3584{
3585 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3586 int rv;
3587
3588 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3589
3590 drbd_bm_lock(mdev, work->why);
3591 rv = work->io_fn(mdev);
3592 drbd_bm_unlock(mdev);
3593
3594 clear_bit(BITMAP_IO, &mdev->flags);
3595 wake_up(&mdev->misc_wait);
3596
3597 if (work->done)
3598 work->done(mdev, rv);
3599
3600 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3601 work->why = NULL;
3602
3603 return 1;
3604}
3605
3606/**
3607 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3608 * @mdev: DRBD device.
3609 * @io_fn: IO callback to be called when bitmap IO is possible
3610 * @done: callback to be called after the bitmap IO was performed
3611 * @why: Descriptive text of the reason for doing the IO
3612 *
 3613 * While IO on the bitmap happens we freeze application IO, thus ensuring
3614 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3615 * called from worker context. It MUST NOT be used while a previous such
3616 * work is still pending!
3617 */
3618void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3619 int (*io_fn)(struct drbd_conf *),
3620 void (*done)(struct drbd_conf *, int),
3621 char *why)
3622{
3623 D_ASSERT(current == mdev->worker.task);
3624
3625 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3626 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3627 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3628 if (mdev->bm_io_work.why)
3629 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3630 why, mdev->bm_io_work.why);
3631
3632 mdev->bm_io_work.io_fn = io_fn;
3633 mdev->bm_io_work.done = done;
3634 mdev->bm_io_work.why = why;
3635
3636 set_bit(BITMAP_IO, &mdev->flags);
3637 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3638 if (list_empty(&mdev->bm_io_work.w.list)) {
3639 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3640 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3641 } else
3642 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3643 }
3644}
3645
3646/**
3647 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3648 * @mdev: DRBD device.
3649 * @io_fn: IO callback to be called when bitmap IO is possible
3650 * @why: Descriptive text of the reason for doing the IO
3651 *
 3652 * freezes application IO while the actual IO operation runs. This
 3653 * function MAY NOT be called from worker context.
3654 */
3655int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3656{
3657 int rv;
3658
3659 D_ASSERT(current != mdev->worker.task);
3660
3661 drbd_suspend_io(mdev);
3662
3663 drbd_bm_lock(mdev, why);
3664 rv = io_fn(mdev);
3665 drbd_bm_unlock(mdev);
3666
3667 drbd_resume_io(mdev);
3668
3669 return rv;
3670}
3671
3672void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3673{
3674 if ((mdev->ldev->md.flags & flag) != flag) {
3675 drbd_md_mark_dirty(mdev);
3676 mdev->ldev->md.flags |= flag;
3677 }
3678}
3679
3680void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3681{
3682 if ((mdev->ldev->md.flags & flag) != 0) {
3683 drbd_md_mark_dirty(mdev);
3684 mdev->ldev->md.flags &= ~flag;
3685 }
3686}
3687int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3688{
3689 return (bdev->md.flags & flag) != 0;
3690}
3691
3692static void md_sync_timer_fn(unsigned long data)
3693{
3694 struct drbd_conf *mdev = (struct drbd_conf *) data;
3695
3696 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3697}
3698
3699static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3700{
3701 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3702 drbd_md_sync(mdev);
3703
3704 return 1;
3705}
3706
3707#ifdef CONFIG_DRBD_FAULT_INJECTION
3708/* Fault insertion support including random number generator shamelessly
3709 * stolen from kernel/rcutorture.c */
3710struct fault_random_state {
3711 unsigned long state;
3712 unsigned long count;
3713};
3714
3715#define FAULT_RANDOM_MULT 39916801 /* prime */
3716#define FAULT_RANDOM_ADD 479001701 /* prime */
3717#define FAULT_RANDOM_REFRESH 10000
3718
3719/*
3720 * Crude but fast random-number generator. Uses a linear congruential
3721 * generator, with occasional help from get_random_bytes().
3722 */
3723static unsigned long
3724_drbd_fault_random(struct fault_random_state *rsp)
3725{
3726 long refresh;
3727
49829ea7 3728 if (!rsp->count--) {
3729 get_random_bytes(&refresh, sizeof(refresh));
3730 rsp->state += refresh;
3731 rsp->count = FAULT_RANDOM_REFRESH;
3732 }
3733 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3734 return swahw32(rsp->state);
3735}
3736
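/*
 * A minimal standalone sketch (not drbd code) of the fault-injection
 * generator above: a linear congruential generator with the same
 * constants, periodically re-seeded with fresh entropy.  In this userspace
 * sketch the re-seed simply uses random() in place of get_random_bytes(),
 * and the word swap via swahw32() that the kernel code applies on return
 * is omitted.
 */
#include <stdio.h>
#include <stdlib.h>

#define FAULT_RANDOM_MULT	39916801UL
#define FAULT_RANDOM_ADD	479001701UL
#define FAULT_RANDOM_REFRESH	10000UL

struct fault_random_state { unsigned long state, count; };

static unsigned long fault_random(struct fault_random_state *rsp)
{
	if (!rsp->count--) {
		rsp->state += (unsigned long)random();	/* stand-in for get_random_bytes() */
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return rsp->state;
}

int main(void)
{
	struct fault_random_state rs = { 0, 0 };
	int fault_rate = 10;	/* percent, as in _drbd_insert_fault() below */
	int hit = ((fault_random(&rs) % 100) + 1) <= fault_rate;

	printf("inject fault: %d\n", hit);
	return 0;
}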
3737static char *
3738_drbd_fault_str(unsigned int type) {
3739 static char *_faults[] = {
3740 [DRBD_FAULT_MD_WR] = "Meta-data write",
3741 [DRBD_FAULT_MD_RD] = "Meta-data read",
3742 [DRBD_FAULT_RS_WR] = "Resync write",
3743 [DRBD_FAULT_RS_RD] = "Resync read",
3744 [DRBD_FAULT_DT_WR] = "Data write",
3745 [DRBD_FAULT_DT_RD] = "Data read",
3746 [DRBD_FAULT_DT_RA] = "Data read ahead",
3747 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3748 [DRBD_FAULT_AL_EE] = "EE allocation",
3749 [DRBD_FAULT_RECEIVE] = "receive data corruption",
3750 };
3751
3752 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3753}
3754
3755unsigned int
3756_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3757{
3758 static struct fault_random_state rrs = {0, 0};
3759
3760 unsigned int ret = (
3761 (fault_devs == 0 ||
3762 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3763 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3764
3765 if (ret) {
3766 fault_count++;
3767
3768 if (printk_ratelimit())
3769 dev_warn(DEV, "***Simulating %s failure\n",
3770 _drbd_fault_str(type));
3771 }
3772
3773 return ret;
3774}
3775#endif
3776
3777const char *drbd_buildtag(void)
3778{
 3779 	/* When DRBD is built from external sources, this carries a reference
 3780 	   to the git hash of the source code. */
3781
3782 static char buildtag[38] = "\0uilt-in";
3783
3784 if (buildtag[0] == 0) {
3785#ifdef CONFIG_MODULES
3786 if (THIS_MODULE != NULL)
3787 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3788 else
3789#endif
3790 buildtag[0] = 'b';
3791 }
3792
3793 return buildtag;
3794}
3795
3796module_init(drbd_init)
3797module_exit(drbd_cleanup)
3798
3799EXPORT_SYMBOL(drbd_conn_str);
3800EXPORT_SYMBOL(drbd_role_str);
3801EXPORT_SYMBOL(drbd_disk_str);
3802EXPORT_SYMBOL(drbd_set_st_err_str);