]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv4/ipvs/ip_vs_ctl.c
[INET_SOCK]: Move struct inet_sock & helper functions to net/inet_sock.h
[net-next-2.6.git] / net / ipv4 / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/fs.h>
27 #include <linux/sysctl.h>
28 #include <linux/proc_fs.h>
29 #include <linux/workqueue.h>
30 #include <linux/swap.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
33
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36
37 #include <net/ip.h>
38 #include <net/route.h>
39 #include <net/sock.h>
40
41 #include <asm/uaccess.h>
42
43 #include <net/ip_vs.h>
44
45 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
46 static DECLARE_MUTEX(__ip_vs_mutex);
47
48 /* lock for service table */
49 static DEFINE_RWLOCK(__ip_vs_svc_lock);
50
51 /* lock for table with the real services */
52 static DEFINE_RWLOCK(__ip_vs_rs_lock);
53
54 /* lock for state and timeout tables */
55 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
56
57 /* lock for drop entry handling */
58 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
59
60 /* lock for drop packet handling */
61 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
62
63 /* 1/rate drop and drop-entry variables */
64 int ip_vs_drop_rate = 0;
65 int ip_vs_drop_counter = 0;
66 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
67
68 /* number of virtual services */
69 static int ip_vs_num_services = 0;
70
71 /* sysctl variables */
72 static int sysctl_ip_vs_drop_entry = 0;
73 static int sysctl_ip_vs_drop_packet = 0;
74 static int sysctl_ip_vs_secure_tcp = 0;
75 static int sysctl_ip_vs_amemthresh = 1024;
76 static int sysctl_ip_vs_am_droprate = 10;
77 int sysctl_ip_vs_cache_bypass = 0;
78 int sysctl_ip_vs_expire_nodest_conn = 0;
79 int sysctl_ip_vs_expire_quiescent_template = 0;
80 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
81 int sysctl_ip_vs_nat_icmp_send = 0;
82
83
#ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity; only compiled in when CONFIG_IP_VS_DEBUG is set. */
static int sysctl_ip_vs_debug_level = 0;

/*
 *      Return the current value of sysctl_ip_vs_debug_level.
 */
int ip_vs_get_debug_level(void)
{
        const int level = sysctl_ip_vs_debug_level;

        return level;
}
#endif
92
93 /*
94  *      update_defense_level is called from keventd and from sysctl,
95  *      so it needs to protect itself from softirqs
96  */
97 static void update_defense_level(void)
98 {
99         struct sysinfo i;
100         static int old_secure_tcp = 0;
101         int availmem;
102         int nomem;
103         int to_change = -1;
104
105         /* we only count free and buffered memory (in pages) */
106         si_meminfo(&i);
107         availmem = i.freeram + i.bufferram;
108         /* however in linux 2.5 the i.bufferram is total page cache size,
109            we need adjust it */
110         /* si_swapinfo(&i); */
111         /* availmem = availmem - (i.totalswap - i.freeswap); */
112
113         nomem = (availmem < sysctl_ip_vs_amemthresh);
114
115         local_bh_disable();
116
117         /* drop_entry */
118         spin_lock(&__ip_vs_dropentry_lock);
119         switch (sysctl_ip_vs_drop_entry) {
120         case 0:
121                 atomic_set(&ip_vs_dropentry, 0);
122                 break;
123         case 1:
124                 if (nomem) {
125                         atomic_set(&ip_vs_dropentry, 1);
126                         sysctl_ip_vs_drop_entry = 2;
127                 } else {
128                         atomic_set(&ip_vs_dropentry, 0);
129                 }
130                 break;
131         case 2:
132                 if (nomem) {
133                         atomic_set(&ip_vs_dropentry, 1);
134                 } else {
135                         atomic_set(&ip_vs_dropentry, 0);
136                         sysctl_ip_vs_drop_entry = 1;
137                 };
138                 break;
139         case 3:
140                 atomic_set(&ip_vs_dropentry, 1);
141                 break;
142         }
143         spin_unlock(&__ip_vs_dropentry_lock);
144
145         /* drop_packet */
146         spin_lock(&__ip_vs_droppacket_lock);
147         switch (sysctl_ip_vs_drop_packet) {
148         case 0:
149                 ip_vs_drop_rate = 0;
150                 break;
151         case 1:
152                 if (nomem) {
153                         ip_vs_drop_rate = ip_vs_drop_counter
154                                 = sysctl_ip_vs_amemthresh /
155                                 (sysctl_ip_vs_amemthresh-availmem);
156                         sysctl_ip_vs_drop_packet = 2;
157                 } else {
158                         ip_vs_drop_rate = 0;
159                 }
160                 break;
161         case 2:
162                 if (nomem) {
163                         ip_vs_drop_rate = ip_vs_drop_counter
164                                 = sysctl_ip_vs_amemthresh /
165                                 (sysctl_ip_vs_amemthresh-availmem);
166                 } else {
167                         ip_vs_drop_rate = 0;
168                         sysctl_ip_vs_drop_packet = 1;
169                 }
170                 break;
171         case 3:
172                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
173                 break;
174         }
175         spin_unlock(&__ip_vs_droppacket_lock);
176
177         /* secure_tcp */
178         write_lock(&__ip_vs_securetcp_lock);
179         switch (sysctl_ip_vs_secure_tcp) {
180         case 0:
181                 if (old_secure_tcp >= 2)
182                         to_change = 0;
183                 break;
184         case 1:
185                 if (nomem) {
186                         if (old_secure_tcp < 2)
187                                 to_change = 1;
188                         sysctl_ip_vs_secure_tcp = 2;
189                 } else {
190                         if (old_secure_tcp >= 2)
191                                 to_change = 0;
192                 }
193                 break;
194         case 2:
195                 if (nomem) {
196                         if (old_secure_tcp < 2)
197                                 to_change = 1;
198                 } else {
199                         if (old_secure_tcp >= 2)
200                                 to_change = 0;
201                         sysctl_ip_vs_secure_tcp = 1;
202                 }
203                 break;
204         case 3:
205                 if (old_secure_tcp < 2)
206                         to_change = 1;
207                 break;
208         }
209         old_secure_tcp = sysctl_ip_vs_secure_tcp;
210         if (to_change >= 0)
211                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
212         write_unlock(&__ip_vs_securetcp_lock);
213
214         local_bh_enable();
215 }
216
217
218 /*
219  *      Timer for checking the defense
220  */
221 #define DEFENSE_TIMER_PERIOD    1*HZ
222 static void defense_work_handler(void *data);
223 static DECLARE_WORK(defense_work, defense_work_handler, NULL);
224
225 static void defense_work_handler(void *data)
226 {
227         update_defense_level();
228         if (atomic_read(&ip_vs_dropentry))
229                 ip_vs_random_dropentry();
230
231         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
232 }
233
234 int
235 ip_vs_use_count_inc(void)
236 {
237         return try_module_get(THIS_MODULE);
238 }
239
240 void
241 ip_vs_use_count_dec(void)
242 {
243         module_put(THIS_MODULE);
244 }
245
246
247 /*
248  *      Hash table: for virtual service lookups
249  */
250 #define IP_VS_SVC_TAB_BITS 8
251 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
252 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
253
254 /* the service table hashed by <protocol, addr, port> */
255 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
256 /* the service table hashed by fwmark */
257 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
258
259 /*
260  *      Hash table: for real service lookups
261  */
262 #define IP_VS_RTAB_BITS 4
263 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
264 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
265
266 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
267
268 /*
269  *      Trash for destinations
270  */
271 static LIST_HEAD(ip_vs_dest_trash);
272
273 /*
274  *      FTP & NULL virtual service counters
275  */
276 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
277 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
278
279
280 /*
281  *      Returns hash value for virtual service
282  */
283 static __inline__ unsigned
284 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
285 {
286         register unsigned porth = ntohs(port);
287
288         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
289                 & IP_VS_SVC_TAB_MASK;
290 }
291
292 /*
293  *      Returns hash value of fwmark for virtual service lookup
294  */
295 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
296 {
297         return fwmark & IP_VS_SVC_TAB_MASK;
298 }
299
300 /*
301  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
302  *      or in the ip_vs_svc_fwm_table by fwmark.
303  *      Should be called with locked tables.
304  */
305 static int ip_vs_svc_hash(struct ip_vs_service *svc)
306 {
307         unsigned hash;
308
309         if (svc->flags & IP_VS_SVC_F_HASHED) {
310                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
311                           "called from %p\n", __builtin_return_address(0));
312                 return 0;
313         }
314
315         if (svc->fwmark == 0) {
316                 /*
317                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
318                  */
319                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
320                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
321         } else {
322                 /*
323                  *  Hash it by fwmark in ip_vs_svc_fwm_table
324                  */
325                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
326                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
327         }
328
329         svc->flags |= IP_VS_SVC_F_HASHED;
330         /* increase its refcnt because it is referenced by the svc table */
331         atomic_inc(&svc->refcnt);
332         return 1;
333 }
334
335
336 /*
337  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
338  *      Should be called with locked tables.
339  */
340 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
341 {
342         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
343                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
344                           "called from %p\n", __builtin_return_address(0));
345                 return 0;
346         }
347
348         if (svc->fwmark == 0) {
349                 /* Remove it from the ip_vs_svc_table table */
350                 list_del(&svc->s_list);
351         } else {
352                 /* Remove it from the ip_vs_svc_fwm_table table */
353                 list_del(&svc->f_list);
354         }
355
356         svc->flags &= ~IP_VS_SVC_F_HASHED;
357         atomic_dec(&svc->refcnt);
358         return 1;
359 }
360
361
362 /*
363  *      Get service by {proto,addr,port} in the service table.
364  */
365 static __inline__ struct ip_vs_service *
366 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
367 {
368         unsigned hash;
369         struct ip_vs_service *svc;
370
371         /* Check for "full" addressed entries */
372         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
373
374         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
375                 if ((svc->addr == vaddr)
376                     && (svc->port == vport)
377                     && (svc->protocol == protocol)) {
378                         /* HIT */
379                         atomic_inc(&svc->usecnt);
380                         return svc;
381                 }
382         }
383
384         return NULL;
385 }
386
387
388 /*
389  *      Get service by {fwmark} in the service table.
390  */
391 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
392 {
393         unsigned hash;
394         struct ip_vs_service *svc;
395
396         /* Check for fwmark addressed entries */
397         hash = ip_vs_svc_fwm_hashkey(fwmark);
398
399         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400                 if (svc->fwmark == fwmark) {
401                         /* HIT */
402                         atomic_inc(&svc->usecnt);
403                         return svc;
404                 }
405         }
406
407         return NULL;
408 }
409
410 struct ip_vs_service *
411 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
412 {
413         struct ip_vs_service *svc;
414
415         read_lock(&__ip_vs_svc_lock);
416
417         /*
418          *      Check the table hashed by fwmark first
419          */
420         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
421                 goto out;
422
423         /*
424          *      Check the table hashed by <protocol,addr,port>
425          *      for "full" addressed entries
426          */
427         svc = __ip_vs_service_get(protocol, vaddr, vport);
428
429         if (svc == NULL
430             && protocol == IPPROTO_TCP
431             && atomic_read(&ip_vs_ftpsvc_counter)
432             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
433                 /*
434                  * Check if ftp service entry exists, the packet
435                  * might belong to FTP data connections.
436                  */
437                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
438         }
439
440         if (svc == NULL
441             && atomic_read(&ip_vs_nullsvc_counter)) {
442                 /*
443                  * Check if the catch-all port (port zero) exists
444                  */
445                 svc = __ip_vs_service_get(protocol, vaddr, 0);
446         }
447
448   out:
449         read_unlock(&__ip_vs_svc_lock);
450
451         IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
452                   fwmark, ip_vs_proto_name(protocol),
453                   NIPQUAD(vaddr), ntohs(vport),
454                   svc?"hit":"not hit");
455
456         return svc;
457 }
458
459
460 static inline void
461 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
462 {
463         atomic_inc(&svc->refcnt);
464         dest->svc = svc;
465 }
466
467 static inline void
468 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
469 {
470         struct ip_vs_service *svc = dest->svc;
471
472         dest->svc = NULL;
473         if (atomic_dec_and_test(&svc->refcnt))
474                 kfree(svc);
475 }
476
477
478 /*
479  *      Returns hash value for real service
480  */
481 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
482 {
483         register unsigned porth = ntohs(port);
484
485         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
486                 & IP_VS_RTAB_MASK;
487 }
488
489 /*
490  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
491  *      should be called with locked tables.
492  */
493 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
494 {
495         unsigned hash;
496
497         if (!list_empty(&dest->d_list)) {
498                 return 0;
499         }
500
501         /*
502          *      Hash by proto,addr,port,
503          *      which are the parameters of the real service.
504          */
505         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
506         list_add(&dest->d_list, &ip_vs_rtable[hash]);
507
508         return 1;
509 }
510
511 /*
512  *      UNhashes ip_vs_dest from ip_vs_rtable.
513  *      should be called with locked tables.
514  */
515 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
516 {
517         /*
518          * Remove it from the ip_vs_rtable table.
519          */
520         if (!list_empty(&dest->d_list)) {
521                 list_del(&dest->d_list);
522                 INIT_LIST_HEAD(&dest->d_list);
523         }
524
525         return 1;
526 }
527
528 /*
529  *      Lookup real service by <proto,addr,port> in the real service table.
530  */
531 struct ip_vs_dest *
532 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
533 {
534         unsigned hash;
535         struct ip_vs_dest *dest;
536
537         /*
538          *      Check for "full" addressed entries
539          *      Return the first found entry
540          */
541         hash = ip_vs_rs_hashkey(daddr, dport);
542
543         read_lock(&__ip_vs_rs_lock);
544         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
545                 if ((dest->addr == daddr)
546                     && (dest->port == dport)
547                     && ((dest->protocol == protocol) ||
548                         dest->vfwmark)) {
549                         /* HIT */
550                         read_unlock(&__ip_vs_rs_lock);
551                         return dest;
552                 }
553         }
554         read_unlock(&__ip_vs_rs_lock);
555
556         return NULL;
557 }
558
559 /*
560  *      Lookup destination by {addr,port} in the given service
561  */
562 static struct ip_vs_dest *
563 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
564 {
565         struct ip_vs_dest *dest;
566
567         /*
568          * Find the destination for the given service
569          */
570         list_for_each_entry(dest, &svc->destinations, n_list) {
571                 if ((dest->addr == daddr) && (dest->port == dport)) {
572                         /* HIT */
573                         return dest;
574                 }
575         }
576
577         return NULL;
578 }
579
580
581 /*
582  *  Lookup dest by {svc,addr,port} in the destination trash.
583  *  The destination trash is used to hold the destinations that are removed
584  *  from the service table but are still referenced by some conn entries.
585  *  The reason to add the destination trash is when the dest is temporary
586  *  down (either by administrator or by monitor program), the dest can be
587  *  picked back from the trash, the remaining connections to the dest can
588  *  continue, and the counting information of the dest is also useful for
589  *  scheduling.
590  */
591 static struct ip_vs_dest *
592 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
593 {
594         struct ip_vs_dest *dest, *nxt;
595
596         /*
597          * Find the destination in trash
598          */
599         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
600                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
601                           "refcnt=%d\n",
602                           dest->vfwmark,
603                           NIPQUAD(dest->addr), ntohs(dest->port),
604                           atomic_read(&dest->refcnt));
605                 if (dest->addr == daddr &&
606                     dest->port == dport &&
607                     dest->vfwmark == svc->fwmark &&
608                     dest->protocol == svc->protocol &&
609                     (svc->fwmark ||
610                      (dest->vaddr == svc->addr &&
611                       dest->vport == svc->port))) {
612                         /* HIT */
613                         return dest;
614                 }
615
616                 /*
617                  * Try to purge the destination from trash if not referenced
618                  */
619                 if (atomic_read(&dest->refcnt) == 1) {
620                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
621                                   "from trash\n",
622                                   dest->vfwmark,
623                                   NIPQUAD(dest->addr), ntohs(dest->port));
624                         list_del(&dest->n_list);
625                         ip_vs_dst_reset(dest);
626                         __ip_vs_unbind_svc(dest);
627                         kfree(dest);
628                 }
629         }
630
631         return NULL;
632 }
633
634
635 /*
636  *  Clean up all the destinations in the trash
637  *  Called by the ip_vs_control_cleanup()
638  *
639  *  When the ip_vs_control_clearup is activated by ipvs module exit,
640  *  the service tables must have been flushed and all the connections
641  *  are expired, and the refcnt of each destination in the trash must
642  *  be 1, so we simply release them here.
643  */
644 static void ip_vs_trash_cleanup(void)
645 {
646         struct ip_vs_dest *dest, *nxt;
647
648         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
649                 list_del(&dest->n_list);
650                 ip_vs_dst_reset(dest);
651                 __ip_vs_unbind_svc(dest);
652                 kfree(dest);
653         }
654 }
655
656
657 static void
658 ip_vs_zero_stats(struct ip_vs_stats *stats)
659 {
660         spin_lock_bh(&stats->lock);
661         memset(stats, 0, (char *)&stats->lock - (char *)stats);
662         spin_unlock_bh(&stats->lock);
663         ip_vs_zero_estimator(stats);
664 }
665
666 /*
667  *      Update a destination in the given service
668  */
669 static void
670 __ip_vs_update_dest(struct ip_vs_service *svc,
671                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
672 {
673         int conn_flags;
674
675         /* set the weight and the flags */
676         atomic_set(&dest->weight, udest->weight);
677         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
678
679         /* check if local node and update the flags */
680         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
681                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
682                         | IP_VS_CONN_F_LOCALNODE;
683         }
684
685         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
686         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
687                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
688         } else {
689                 /*
690                  *    Put the real service in ip_vs_rtable if not present.
691                  *    For now only for NAT!
692                  */
693                 write_lock_bh(&__ip_vs_rs_lock);
694                 ip_vs_rs_hash(dest);
695                 write_unlock_bh(&__ip_vs_rs_lock);
696         }
697         atomic_set(&dest->conn_flags, conn_flags);
698
699         /* bind the service */
700         if (!dest->svc) {
701                 __ip_vs_bind_svc(dest, svc);
702         } else {
703                 if (dest->svc != svc) {
704                         __ip_vs_unbind_svc(dest);
705                         ip_vs_zero_stats(&dest->stats);
706                         __ip_vs_bind_svc(dest, svc);
707                 }
708         }
709
710         /* set the dest status flags */
711         dest->flags |= IP_VS_DEST_F_AVAILABLE;
712
713         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
714                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
715         dest->u_threshold = udest->u_threshold;
716         dest->l_threshold = udest->l_threshold;
717 }
718
719
720 /*
721  *      Create a destination for the given service
722  */
723 static int
724 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
725                struct ip_vs_dest **dest_p)
726 {
727         struct ip_vs_dest *dest;
728         unsigned atype;
729
730         EnterFunction(2);
731
732         atype = inet_addr_type(udest->addr);
733         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
734                 return -EINVAL;
735
736         dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
737         if (dest == NULL) {
738                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
739                 return -ENOMEM;
740         }
741         memset(dest, 0, sizeof(struct ip_vs_dest));
742
743         dest->protocol = svc->protocol;
744         dest->vaddr = svc->addr;
745         dest->vport = svc->port;
746         dest->vfwmark = svc->fwmark;
747         dest->addr = udest->addr;
748         dest->port = udest->port;
749
750         atomic_set(&dest->activeconns, 0);
751         atomic_set(&dest->inactconns, 0);
752         atomic_set(&dest->persistconns, 0);
753         atomic_set(&dest->refcnt, 0);
754
755         INIT_LIST_HEAD(&dest->d_list);
756         spin_lock_init(&dest->dst_lock);
757         spin_lock_init(&dest->stats.lock);
758         __ip_vs_update_dest(svc, dest, udest);
759         ip_vs_new_estimator(&dest->stats);
760
761         *dest_p = dest;
762
763         LeaveFunction(2);
764         return 0;
765 }
766
767
768 /*
769  *      Add a destination into an existing service
770  */
771 static int
772 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
773 {
774         struct ip_vs_dest *dest;
775         __u32 daddr = udest->addr;
776         __u16 dport = udest->port;
777         int ret;
778
779         EnterFunction(2);
780
781         if (udest->weight < 0) {
782                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
783                 return -ERANGE;
784         }
785
786         if (udest->l_threshold > udest->u_threshold) {
787                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
788                           "upper threshold\n");
789                 return -ERANGE;
790         }
791
792         /*
793          * Check if the dest already exists in the list
794          */
795         dest = ip_vs_lookup_dest(svc, daddr, dport);
796         if (dest != NULL) {
797                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
798                 return -EEXIST;
799         }
800
801         /*
802          * Check if the dest already exists in the trash and
803          * is from the same service
804          */
805         dest = ip_vs_trash_get_dest(svc, daddr, dport);
806         if (dest != NULL) {
807                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
808                           "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
809                           NIPQUAD(daddr), ntohs(dport),
810                           atomic_read(&dest->refcnt),
811                           dest->vfwmark,
812                           NIPQUAD(dest->vaddr),
813                           ntohs(dest->vport));
814                 __ip_vs_update_dest(svc, dest, udest);
815
816                 /*
817                  * Get the destination from the trash
818                  */
819                 list_del(&dest->n_list);
820
821                 ip_vs_new_estimator(&dest->stats);
822
823                 write_lock_bh(&__ip_vs_svc_lock);
824
825                 /*
826                  * Wait until all other svc users go away.
827                  */
828                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
829
830                 list_add(&dest->n_list, &svc->destinations);
831                 svc->num_dests++;
832
833                 /* call the update_service function of its scheduler */
834                 svc->scheduler->update_service(svc);
835
836                 write_unlock_bh(&__ip_vs_svc_lock);
837                 return 0;
838         }
839
840         /*
841          * Allocate and initialize the dest structure
842          */
843         ret = ip_vs_new_dest(svc, udest, &dest);
844         if (ret) {
845                 return ret;
846         }
847
848         /*
849          * Add the dest entry into the list
850          */
851         atomic_inc(&dest->refcnt);
852
853         write_lock_bh(&__ip_vs_svc_lock);
854
855         /*
856          * Wait until all other svc users go away.
857          */
858         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
859
860         list_add(&dest->n_list, &svc->destinations);
861         svc->num_dests++;
862
863         /* call the update_service function of its scheduler */
864         svc->scheduler->update_service(svc);
865
866         write_unlock_bh(&__ip_vs_svc_lock);
867
868         LeaveFunction(2);
869
870         return 0;
871 }
872
873
874 /*
875  *      Edit a destination in the given service
876  */
877 static int
878 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
879 {
880         struct ip_vs_dest *dest;
881         __u32 daddr = udest->addr;
882         __u16 dport = udest->port;
883
884         EnterFunction(2);
885
886         if (udest->weight < 0) {
887                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
888                 return -ERANGE;
889         }
890
891         if (udest->l_threshold > udest->u_threshold) {
892                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
893                           "upper threshold\n");
894                 return -ERANGE;
895         }
896
897         /*
898          *  Lookup the destination list
899          */
900         dest = ip_vs_lookup_dest(svc, daddr, dport);
901         if (dest == NULL) {
902                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
903                 return -ENOENT;
904         }
905
906         __ip_vs_update_dest(svc, dest, udest);
907
908         write_lock_bh(&__ip_vs_svc_lock);
909
910         /* Wait until all other svc users go away */
911         while (atomic_read(&svc->usecnt) > 1) {};
912
913         /* call the update_service, because server weight may be changed */
914         svc->scheduler->update_service(svc);
915
916         write_unlock_bh(&__ip_vs_svc_lock);
917
918         LeaveFunction(2);
919
920         return 0;
921 }
922
923
924 /*
925  *      Delete a destination (must be already unlinked from the service)
926  */
927 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
928 {
929         ip_vs_kill_estimator(&dest->stats);
930
931         /*
932          *  Remove it from the d-linked list with the real services.
933          */
934         write_lock_bh(&__ip_vs_rs_lock);
935         ip_vs_rs_unhash(dest);
936         write_unlock_bh(&__ip_vs_rs_lock);
937
938         /*
939          *  Decrease the refcnt of the dest, and free the dest
940          *  if nobody refers to it (refcnt=0). Otherwise, throw
941          *  the destination into the trash.
942          */
943         if (atomic_dec_and_test(&dest->refcnt)) {
944                 ip_vs_dst_reset(dest);
945                 /* simply decrease svc->refcnt here, let the caller check
946                    and release the service if nobody refers to it.
947                    Only user context can release destination and service,
948                    and only one user context can update virtual service at a
949                    time, so the operation here is OK */
950                 atomic_dec(&dest->svc->refcnt);
951                 kfree(dest);
952         } else {
953                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
954                           NIPQUAD(dest->addr), ntohs(dest->port),
955                           atomic_read(&dest->refcnt));
956                 list_add(&dest->n_list, &ip_vs_dest_trash);
957                 atomic_inc(&dest->refcnt);
958         }
959 }
960
961
962 /*
963  *      Unlink a destination from the given service
964  */
965 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
966                                 struct ip_vs_dest *dest,
967                                 int svcupd)
968 {
969         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
970
971         /*
972          *  Remove it from the d-linked destination list.
973          */
974         list_del(&dest->n_list);
975         svc->num_dests--;
976         if (svcupd) {
977                 /*
978                  *  Call the update_service function of its scheduler
979                  */
980                 svc->scheduler->update_service(svc);
981         }
982 }
983
984
985 /*
986  *      Delete a destination server in the given service
987  */
988 static int
989 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
990 {
991         struct ip_vs_dest *dest;
992         __u32 daddr = udest->addr;
993         __u16 dport = udest->port;
994
995         EnterFunction(2);
996
997         dest = ip_vs_lookup_dest(svc, daddr, dport);
998         if (dest == NULL) {
999                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1000                 return -ENOENT;
1001         }
1002
1003         write_lock_bh(&__ip_vs_svc_lock);
1004
1005         /*
1006          *      Wait until all other svc users go away.
1007          */
1008         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1009
1010         /*
1011          *      Unlink dest from the service
1012          */
1013         __ip_vs_unlink_dest(svc, dest, 1);
1014
1015         write_unlock_bh(&__ip_vs_svc_lock);
1016
1017         /*
1018          *      Delete the destination
1019          */
1020         __ip_vs_del_dest(dest);
1021
1022         LeaveFunction(2);
1023
1024         return 0;
1025 }
1026
1027
1028 /*
1029  *      Add a service into the service hash table
1030  */
1031 static int
1032 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1033 {
1034         int ret = 0;
1035         struct ip_vs_scheduler *sched = NULL;
1036         struct ip_vs_service *svc = NULL;
1037
1038         /* increase the module use count */
1039         ip_vs_use_count_inc();
1040
1041         /* Lookup the scheduler by 'u->sched_name' */
1042         sched = ip_vs_scheduler_get(u->sched_name);
1043         if (sched == NULL) {
1044                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1045                            u->sched_name);
1046                 ret = -ENOENT;
1047                 goto out_mod_dec;
1048         }
1049
1050         svc = (struct ip_vs_service *)
1051                 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1052         if (svc == NULL) {
1053                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1054                 ret = -ENOMEM;
1055                 goto out_err;
1056         }
1057         memset(svc, 0, sizeof(struct ip_vs_service));
1058
1059         /* I'm the first user of the service */
1060         atomic_set(&svc->usecnt, 1);
1061         atomic_set(&svc->refcnt, 0);
1062
1063         svc->protocol = u->protocol;
1064         svc->addr = u->addr;
1065         svc->port = u->port;
1066         svc->fwmark = u->fwmark;
1067         svc->flags = u->flags;
1068         svc->timeout = u->timeout * HZ;
1069         svc->netmask = u->netmask;
1070
1071         INIT_LIST_HEAD(&svc->destinations);
1072         rwlock_init(&svc->sched_lock);
1073         spin_lock_init(&svc->stats.lock);
1074
1075         /* Bind the scheduler */
1076         ret = ip_vs_bind_scheduler(svc, sched);
1077         if (ret)
1078                 goto out_err;
1079         sched = NULL;
1080
1081         /* Update the virtual service counters */
1082         if (svc->port == FTPPORT)
1083                 atomic_inc(&ip_vs_ftpsvc_counter);
1084         else if (svc->port == 0)
1085                 atomic_inc(&ip_vs_nullsvc_counter);
1086
1087         ip_vs_new_estimator(&svc->stats);
1088         ip_vs_num_services++;
1089
1090         /* Hash the service into the service table */
1091         write_lock_bh(&__ip_vs_svc_lock);
1092         ip_vs_svc_hash(svc);
1093         write_unlock_bh(&__ip_vs_svc_lock);
1094
1095         *svc_p = svc;
1096         return 0;
1097
1098   out_err:
1099         if (svc != NULL) {
1100                 if (svc->scheduler)
1101                         ip_vs_unbind_scheduler(svc);
1102                 if (svc->inc) {
1103                         local_bh_disable();
1104                         ip_vs_app_inc_put(svc->inc);
1105                         local_bh_enable();
1106                 }
1107                 kfree(svc);
1108         }
1109         ip_vs_scheduler_put(sched);
1110
1111   out_mod_dec:
1112         /* decrease the module use count */
1113         ip_vs_use_count_dec();
1114
1115         return ret;
1116 }
1117
1118
1119 /*
1120  *      Edit a service and bind it with a new scheduler
1121  */
1122 static int
1123 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1124 {
1125         struct ip_vs_scheduler *sched, *old_sched;
1126         int ret = 0;
1127
1128         /*
1129          * Lookup the scheduler, by 'u->sched_name'
1130          */
1131         sched = ip_vs_scheduler_get(u->sched_name);
1132         if (sched == NULL) {
1133                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1134                            u->sched_name);
1135                 return -ENOENT;
1136         }
1137         old_sched = sched;
1138
1139         write_lock_bh(&__ip_vs_svc_lock);
1140
1141         /*
1142          * Wait until all other svc users go away.
1143          */
1144         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1145
1146         /*
1147          * Set the flags and timeout value
1148          */
1149         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1150         svc->timeout = u->timeout * HZ;
1151         svc->netmask = u->netmask;
1152
1153         old_sched = svc->scheduler;
1154         if (sched != old_sched) {
1155                 /*
1156                  * Unbind the old scheduler
1157                  */
1158                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1159                         old_sched = sched;
1160                         goto out;
1161                 }
1162
1163                 /*
1164                  * Bind the new scheduler
1165                  */
1166                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1167                         /*
1168                          * If ip_vs_bind_scheduler fails, restore the old
1169                          * scheduler.
1170                          * The main reason of failure is out of memory.
1171                          *
1172                          * The question is if the old scheduler can be
1173                          * restored all the time. TODO: if it cannot be
1174                          * restored some time, we must delete the service,
1175                          * otherwise the system may crash.
1176                          */
1177                         ip_vs_bind_scheduler(svc, old_sched);
1178                         old_sched = sched;
1179                         goto out;
1180                 }
1181         }
1182
1183   out:
1184         write_unlock_bh(&__ip_vs_svc_lock);
1185
1186         if (old_sched)
1187                 ip_vs_scheduler_put(old_sched);
1188
1189         return ret;
1190 }
1191
1192
1193 /*
1194  *      Delete a service from the service list
1195  *      - The service must be unlinked, unlocked and not referenced!
1196  *      - We are called under _bh lock
1197  */
1198 static void __ip_vs_del_service(struct ip_vs_service *svc)
1199 {
1200         struct ip_vs_dest *dest, *nxt;
1201         struct ip_vs_scheduler *old_sched;
1202
1203         ip_vs_num_services--;
1204         ip_vs_kill_estimator(&svc->stats);
1205
1206         /* Unbind scheduler */
1207         old_sched = svc->scheduler;
1208         ip_vs_unbind_scheduler(svc);
1209         if (old_sched)
1210                 ip_vs_scheduler_put(old_sched);
1211
1212         /* Unbind app inc */
1213         if (svc->inc) {
1214                 ip_vs_app_inc_put(svc->inc);
1215                 svc->inc = NULL;
1216         }
1217
1218         /*
1219          *    Unlink the whole destination list
1220          */
1221         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1222                 __ip_vs_unlink_dest(svc, dest, 0);
1223                 __ip_vs_del_dest(dest);
1224         }
1225
1226         /*
1227          *    Update the virtual service counters
1228          */
1229         if (svc->port == FTPPORT)
1230                 atomic_dec(&ip_vs_ftpsvc_counter);
1231         else if (svc->port == 0)
1232                 atomic_dec(&ip_vs_nullsvc_counter);
1233
1234         /*
1235          *    Free the service if nobody refers to it
1236          */
1237         if (atomic_read(&svc->refcnt) == 0)
1238                 kfree(svc);
1239
1240         /* decrease the module use count */
1241         ip_vs_use_count_dec();
1242 }
1243
1244 /*
1245  *      Delete a service from the service list
1246  */
1247 static int ip_vs_del_service(struct ip_vs_service *svc)
1248 {
1249         if (svc == NULL)
1250                 return -EEXIST;
1251
1252         /*
1253          * Unhash it from the service table
1254          */
1255         write_lock_bh(&__ip_vs_svc_lock);
1256
1257         ip_vs_svc_unhash(svc);
1258
1259         /*
1260          * Wait until all the svc users go away.
1261          */
1262         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1263
1264         __ip_vs_del_service(svc);
1265
1266         write_unlock_bh(&__ip_vs_svc_lock);
1267
1268         return 0;
1269 }
1270
1271
1272 /*
1273  *      Flush all the virtual services
1274  */
1275 static int ip_vs_flush(void)
1276 {
1277         int idx;
1278         struct ip_vs_service *svc, *nxt;
1279
1280         /*
1281          * Flush the service table hashed by <protocol,addr,port>
1282          */
1283         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1284                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1285                         write_lock_bh(&__ip_vs_svc_lock);
1286                         ip_vs_svc_unhash(svc);
1287                         /*
1288                          * Wait until all the svc users go away.
1289                          */
1290                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1291                         __ip_vs_del_service(svc);
1292                         write_unlock_bh(&__ip_vs_svc_lock);
1293                 }
1294         }
1295
1296         /*
1297          * Flush the service table hashed by fwmark
1298          */
1299         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1300                 list_for_each_entry_safe(svc, nxt,
1301                                          &ip_vs_svc_fwm_table[idx], f_list) {
1302                         write_lock_bh(&__ip_vs_svc_lock);
1303                         ip_vs_svc_unhash(svc);
1304                         /*
1305                          * Wait until all the svc users go away.
1306                          */
1307                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1308                         __ip_vs_del_service(svc);
1309                         write_unlock_bh(&__ip_vs_svc_lock);
1310                 }
1311         }
1312
1313         return 0;
1314 }
1315
1316
1317 /*
1318  *      Zero counters in a service or all services
1319  */
1320 static int ip_vs_zero_service(struct ip_vs_service *svc)
1321 {
1322         struct ip_vs_dest *dest;
1323
1324         write_lock_bh(&__ip_vs_svc_lock);
1325         list_for_each_entry(dest, &svc->destinations, n_list) {
1326                 ip_vs_zero_stats(&dest->stats);
1327         }
1328         ip_vs_zero_stats(&svc->stats);
1329         write_unlock_bh(&__ip_vs_svc_lock);
1330         return 0;
1331 }
1332
1333 static int ip_vs_zero_all(void)
1334 {
1335         int idx;
1336         struct ip_vs_service *svc;
1337
1338         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1340                         ip_vs_zero_service(svc);
1341                 }
1342         }
1343
1344         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1345                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1346                         ip_vs_zero_service(svc);
1347                 }
1348         }
1349
1350         ip_vs_zero_stats(&ip_vs_stats);
1351         return 0;
1352 }
1353
1354
1355 static int
1356 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1357                      void __user *buffer, size_t *lenp, loff_t *ppos)
1358 {
1359         int *valp = table->data;
1360         int val = *valp;
1361         int rc;
1362
1363         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1364         if (write && (*valp != val)) {
1365                 if ((*valp < 0) || (*valp > 3)) {
1366                         /* Restore the correct value */
1367                         *valp = val;
1368                 } else {
1369                         update_defense_level();
1370                 }
1371         }
1372         return rc;
1373 }
1374
1375
1376 static int
1377 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1378                        void __user *buffer, size_t *lenp, loff_t *ppos)
1379 {
1380         int *valp = table->data;
1381         int val[2];
1382         int rc;
1383
1384         /* backup the value first */
1385         memcpy(val, valp, sizeof(val));
1386
1387         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1388         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1389                 /* Restore the correct value */
1390                 memcpy(valp, val, sizeof(val));
1391         }
1392         return rc;
1393 }
1394
1395
1396 /*
1397  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1398  */
1399
1400 static struct ctl_table vs_vars[] = {
1401         {
1402                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1403                 .procname       = "amemthresh",
1404                 .data           = &sysctl_ip_vs_amemthresh,
1405                 .maxlen         = sizeof(int),
1406                 .mode           = 0644,
1407                 .proc_handler   = &proc_dointvec,
1408         },
1409 #ifdef CONFIG_IP_VS_DEBUG
1410         {
1411                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1412                 .procname       = "debug_level",
1413                 .data           = &sysctl_ip_vs_debug_level,
1414                 .maxlen         = sizeof(int),
1415                 .mode           = 0644,
1416                 .proc_handler   = &proc_dointvec,
1417         },
1418 #endif
1419         {
1420                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1421                 .procname       = "am_droprate",
1422                 .data           = &sysctl_ip_vs_am_droprate,
1423                 .maxlen         = sizeof(int),
1424                 .mode           = 0644,
1425                 .proc_handler   = &proc_dointvec,
1426         },
1427         {
1428                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1429                 .procname       = "drop_entry",
1430                 .data           = &sysctl_ip_vs_drop_entry,
1431                 .maxlen         = sizeof(int),
1432                 .mode           = 0644,
1433                 .proc_handler   = &proc_do_defense_mode,
1434         },
1435         {
1436                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1437                 .procname       = "drop_packet",
1438                 .data           = &sysctl_ip_vs_drop_packet,
1439                 .maxlen         = sizeof(int),
1440                 .mode           = 0644,
1441                 .proc_handler   = &proc_do_defense_mode,
1442         },
1443         {
1444                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1445                 .procname       = "secure_tcp",
1446                 .data           = &sysctl_ip_vs_secure_tcp,
1447                 .maxlen         = sizeof(int),
1448                 .mode           = 0644,
1449                 .proc_handler   = &proc_do_defense_mode,
1450         },
1451 #if 0
1452         {
1453                 .ctl_name       = NET_IPV4_VS_TO_ES,
1454                 .procname       = "timeout_established",
1455                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1456                 .maxlen         = sizeof(int),
1457                 .mode           = 0644,
1458                 .proc_handler   = &proc_dointvec_jiffies,
1459         },
1460         {
1461                 .ctl_name       = NET_IPV4_VS_TO_SS,
1462                 .procname       = "timeout_synsent",
1463                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1464                 .maxlen         = sizeof(int),
1465                 .mode           = 0644,
1466                 .proc_handler   = &proc_dointvec_jiffies,
1467         },
1468         {
1469                 .ctl_name       = NET_IPV4_VS_TO_SR,
1470                 .procname       = "timeout_synrecv",
1471                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1472                 .maxlen         = sizeof(int),
1473                 .mode           = 0644,
1474                 .proc_handler   = &proc_dointvec_jiffies,
1475         },
1476         {
1477                 .ctl_name       = NET_IPV4_VS_TO_FW,
1478                 .procname       = "timeout_finwait",
1479                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1480                 .maxlen         = sizeof(int),
1481                 .mode           = 0644,
1482                 .proc_handler   = &proc_dointvec_jiffies,
1483         },
1484         {
1485                 .ctl_name       = NET_IPV4_VS_TO_TW,
1486                 .procname       = "timeout_timewait",
1487                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1488                 .maxlen         = sizeof(int),
1489                 .mode           = 0644,
1490                 .proc_handler   = &proc_dointvec_jiffies,
1491         },
1492         {
1493                 .ctl_name       = NET_IPV4_VS_TO_CL,
1494                 .procname       = "timeout_close",
1495                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1496                 .maxlen         = sizeof(int),
1497                 .mode           = 0644,
1498                 .proc_handler   = &proc_dointvec_jiffies,
1499         },
1500         {
1501                 .ctl_name       = NET_IPV4_VS_TO_CW,
1502                 .procname       = "timeout_closewait",
1503                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1504                 .maxlen         = sizeof(int),
1505                 .mode           = 0644,
1506                 .proc_handler   = &proc_dointvec_jiffies,
1507         },
1508         {
1509                 .ctl_name       = NET_IPV4_VS_TO_LA,
1510                 .procname       = "timeout_lastack",
1511                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1512                 .maxlen         = sizeof(int),
1513                 .mode           = 0644,
1514                 .proc_handler   = &proc_dointvec_jiffies,
1515         },
1516         {
1517                 .ctl_name       = NET_IPV4_VS_TO_LI,
1518                 .procname       = "timeout_listen",
1519                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1520                 .maxlen         = sizeof(int),
1521                 .mode           = 0644,
1522                 .proc_handler   = &proc_dointvec_jiffies,
1523         },
1524         {
1525                 .ctl_name       = NET_IPV4_VS_TO_SA,
1526                 .procname       = "timeout_synack",
1527                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1528                 .maxlen         = sizeof(int),
1529                 .mode           = 0644,
1530                 .proc_handler   = &proc_dointvec_jiffies,
1531         },
1532         {
1533                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1534                 .procname       = "timeout_udp",
1535                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1536                 .maxlen         = sizeof(int),
1537                 .mode           = 0644,
1538                 .proc_handler   = &proc_dointvec_jiffies,
1539         },
1540         {
1541                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1542                 .procname       = "timeout_icmp",
1543                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1544                 .maxlen         = sizeof(int),
1545                 .mode           = 0644,
1546                 .proc_handler   = &proc_dointvec_jiffies,
1547         },
1548 #endif
1549         {
1550                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1551                 .procname       = "cache_bypass",
1552                 .data           = &sysctl_ip_vs_cache_bypass,
1553                 .maxlen         = sizeof(int),
1554                 .mode           = 0644,
1555                 .proc_handler   = &proc_dointvec,
1556         },
1557         {
1558                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1559                 .procname       = "expire_nodest_conn",
1560                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1561                 .maxlen         = sizeof(int),
1562                 .mode           = 0644,
1563                 .proc_handler   = &proc_dointvec,
1564         },
1565         {
1566                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1567                 .procname       = "expire_quiescent_template",
1568                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1569                 .maxlen         = sizeof(int),
1570                 .mode           = 0644,
1571                 .proc_handler   = &proc_dointvec,
1572         },
1573         {
1574                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1575                 .procname       = "sync_threshold",
1576                 .data           = &sysctl_ip_vs_sync_threshold,
1577                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1578                 .mode           = 0644,
1579                 .proc_handler   = &proc_do_sync_threshold,
1580         },
1581         {
1582                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1583                 .procname       = "nat_icmp_send",
1584                 .data           = &sysctl_ip_vs_nat_icmp_send,
1585                 .maxlen         = sizeof(int),
1586                 .mode           = 0644,
1587                 .proc_handler   = &proc_dointvec,
1588         },
1589         { .ctl_name = 0 }
1590 };
1591
1592 static ctl_table vs_table[] = {
1593         {
1594                 .ctl_name       = NET_IPV4_VS,
1595                 .procname       = "vs",
1596                 .mode           = 0555,
1597                 .child          = vs_vars
1598         },
1599         { .ctl_name = 0 }
1600 };
1601
1602 static ctl_table ipvs_ipv4_table[] = {
1603         {
1604                 .ctl_name       = NET_IPV4,
1605                 .procname       = "ipv4",
1606                 .mode           = 0555,
1607                 .child          = vs_table,
1608         },
1609         { .ctl_name = 0 }
1610 };
1611
1612 static ctl_table vs_root_table[] = {
1613         {
1614                 .ctl_name       = CTL_NET,
1615                 .procname       = "net",
1616                 .mode           = 0555,
1617                 .child          = ipvs_ipv4_table,
1618         },
1619         { .ctl_name = 0 }
1620 };
1621
/* sysctl table header pointer for IPVS; not assigned in this part of the file. */
static struct ctl_table_header *sysctl_header;
1623
1624 #ifdef CONFIG_PROC_FS
1625
/* Per-open iterator state for the service listing seq_file. */
struct ip_vs_iter {
        struct list_head *table;        /* ip_vs_svc_table or ip_vs_svc_fwm_table */
        int bucket;                     /* index of the current hash bucket */
};
1630
1631 /*
1632  *      Write the contents of the VS rule table to a PROCfs file.
1633  *      (It is kept just for backward compatibility)
1634  */
1635 static inline const char *ip_vs_fwd_name(unsigned flags)
1636 {
1637         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1638         case IP_VS_CONN_F_LOCALNODE:
1639                 return "Local";
1640         case IP_VS_CONN_F_TUNNEL:
1641                 return "Tunnel";
1642         case IP_VS_CONN_F_DROUTE:
1643                 return "Route";
1644         default:
1645                 return "Masq";
1646         }
1647 }
1648
1649
1650 /* Get the Nth entry in the two lists */
1651 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1652 {
1653         struct ip_vs_iter *iter = seq->private;
1654         int idx;
1655         struct ip_vs_service *svc;
1656
1657         /* look in hash by protocol */
1658         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1659                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1660                         if (pos-- == 0){
1661                                 iter->table = ip_vs_svc_table;
1662                                 iter->bucket = idx;
1663                                 return svc;
1664                         }
1665                 }
1666         }
1667
1668         /* keep looking in fwmark */
1669         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1670                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1671                         if (pos-- == 0) {
1672                                 iter->table = ip_vs_svc_fwm_table;
1673                                 iter->bucket = idx;
1674                                 return svc;
1675                         }
1676                 }
1677         }
1678
1679         return NULL;
1680 }
1681
1682 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1683 {
1684
1685         read_lock_bh(&__ip_vs_svc_lock);
1686         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1687 }
1688
1689
1690 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1691 {
1692         struct list_head *e;
1693         struct ip_vs_iter *iter;
1694         struct ip_vs_service *svc;
1695
1696         ++*pos;
1697         if (v == SEQ_START_TOKEN)
1698                 return ip_vs_info_array(seq,0);
1699
1700         svc = v;
1701         iter = seq->private;
1702
1703         if (iter->table == ip_vs_svc_table) {
1704                 /* next service in table hashed by protocol */
1705                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1706                         return list_entry(e, struct ip_vs_service, s_list);
1707
1708
1709                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1710                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1711                                             s_list) {
1712                                 return svc;
1713                         }
1714                 }
1715
1716                 iter->table = ip_vs_svc_fwm_table;
1717                 iter->bucket = -1;
1718                 goto scan_fwmark;
1719         }
1720
1721         /* next service in hashed by fwmark */
1722         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1723                 return list_entry(e, struct ip_vs_service, f_list);
1724
1725  scan_fwmark:
1726         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1727                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1728                                     f_list)
1729                         return svc;
1730         }
1731
1732         return NULL;
1733 }
1734
1735 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1736 {
1737         read_unlock_bh(&__ip_vs_svc_lock);
1738 }
1739
1740
1741 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1742 {
1743         if (v == SEQ_START_TOKEN) {
1744                 seq_printf(seq,
1745                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1746                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1747                 seq_puts(seq,
1748                          "Prot LocalAddress:Port Scheduler Flags\n");
1749                 seq_puts(seq,
1750                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1751         } else {
1752                 const struct ip_vs_service *svc = v;
1753                 const struct ip_vs_iter *iter = seq->private;
1754                 const struct ip_vs_dest *dest;
1755
1756                 if (iter->table == ip_vs_svc_table)
1757                         seq_printf(seq, "%s  %08X:%04X %s ",
1758                                    ip_vs_proto_name(svc->protocol),
1759                                    ntohl(svc->addr),
1760                                    ntohs(svc->port),
1761                                    svc->scheduler->name);
1762                 else
1763                         seq_printf(seq, "FWM  %08X %s ",
1764                                    svc->fwmark, svc->scheduler->name);
1765
1766                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1767                         seq_printf(seq, "persistent %d %08X\n",
1768                                 svc->timeout,
1769                                 ntohl(svc->netmask));
1770                 else
1771                         seq_putc(seq, '\n');
1772
1773                 list_for_each_entry(dest, &svc->destinations, n_list) {
1774                         seq_printf(seq,
1775                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1776                                    ntohl(dest->addr), ntohs(dest->port),
1777                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1778                                    atomic_read(&dest->weight),
1779                                    atomic_read(&dest->activeconns),
1780                                    atomic_read(&dest->inactconns));
1781                 }
1782         }
1783         return 0;
1784 }
1785
1786 static struct seq_operations ip_vs_info_seq_ops = {
1787         .start = ip_vs_info_seq_start,
1788         .next  = ip_vs_info_seq_next,
1789         .stop  = ip_vs_info_seq_stop,
1790         .show  = ip_vs_info_seq_show,
1791 };
1792
1793 static int ip_vs_info_open(struct inode *inode, struct file *file)
1794 {
1795         struct seq_file *seq;
1796         int rc = -ENOMEM;
1797         struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1798
1799         if (!s)
1800                 goto out;
1801
1802         rc = seq_open(file, &ip_vs_info_seq_ops);
1803         if (rc)
1804                 goto out_kfree;
1805
1806         seq          = file->private_data;
1807         seq->private = s;
1808         memset(s, 0, sizeof(*s));
1809 out:
1810         return rc;
1811 out_kfree:
1812         kfree(s);
1813         goto out;
1814 }
1815
1816 static struct file_operations ip_vs_info_fops = {
1817         .owner   = THIS_MODULE,
1818         .open    = ip_vs_info_open,
1819         .read    = seq_read,
1820         .llseek  = seq_lseek,
1821         .release = seq_release_private,
1822 };
1823
1824 #endif
1825
1826 struct ip_vs_stats ip_vs_stats;
1827
1828 #ifdef CONFIG_PROC_FS
1829 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1830 {
1831
1832 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1833         seq_puts(seq,
1834                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1835         seq_printf(seq,
1836                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1837
1838         spin_lock_bh(&ip_vs_stats.lock);
1839         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1840                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1841                    (unsigned long long) ip_vs_stats.inbytes,
1842                    (unsigned long long) ip_vs_stats.outbytes);
1843
1844 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1845         seq_puts(seq,
1846                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1847         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1848                         ip_vs_stats.cps,
1849                         ip_vs_stats.inpps,
1850                         ip_vs_stats.outpps,
1851                         ip_vs_stats.inbps,
1852                         ip_vs_stats.outbps);
1853         spin_unlock_bh(&ip_vs_stats.lock);
1854
1855         return 0;
1856 }
1857
1858 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1859 {
1860         return single_open(file, ip_vs_stats_show, NULL);
1861 }
1862
1863 static struct file_operations ip_vs_stats_fops = {
1864         .owner = THIS_MODULE,
1865         .open = ip_vs_stats_seq_open,
1866         .read = seq_read,
1867         .llseek = seq_lseek,
1868         .release = single_release,
1869 };
1870
1871 #endif
1872
1873 /*
1874  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1875  */
1876 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1877 {
1878         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1879                   u->tcp_timeout,
1880                   u->tcp_fin_timeout,
1881                   u->udp_timeout);
1882
1883 #ifdef CONFIG_IP_VS_PROTO_TCP
1884         if (u->tcp_timeout) {
1885                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1886                         = u->tcp_timeout * HZ;
1887         }
1888
1889         if (u->tcp_fin_timeout) {
1890                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1891                         = u->tcp_fin_timeout * HZ;
1892         }
1893 #endif
1894
1895 #ifdef CONFIG_IP_VS_PROTO_UDP
1896         if (u->udp_timeout) {
1897                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1898                         = u->udp_timeout * HZ;
1899         }
1900 #endif
1901         return 0;
1902 }
1903
1904
1905 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1906 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1907 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1908                                  sizeof(struct ip_vs_dest_user))
1909 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1910 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1911 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1912
1913 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1914         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1915         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1916         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1917         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1918         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1919         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1920         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1921         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1922         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1923         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1924         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1925 };
1926
1927 static int
1928 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1929 {
1930         int ret;
1931         unsigned char arg[MAX_ARG_LEN];
1932         struct ip_vs_service_user *usvc;
1933         struct ip_vs_service *svc;
1934         struct ip_vs_dest_user *udest;
1935
1936         if (!capable(CAP_NET_ADMIN))
1937                 return -EPERM;
1938
1939         if (len != set_arglen[SET_CMDID(cmd)]) {
1940                 IP_VS_ERR("set_ctl: len %u != %u\n",
1941                           len, set_arglen[SET_CMDID(cmd)]);
1942                 return -EINVAL;
1943         }
1944
1945         if (copy_from_user(arg, user, len) != 0)
1946                 return -EFAULT;
1947
1948         /* increase the module use count */
1949         ip_vs_use_count_inc();
1950
1951         if (down_interruptible(&__ip_vs_mutex)) {
1952                 ret = -ERESTARTSYS;
1953                 goto out_dec;
1954         }
1955
1956         if (cmd == IP_VS_SO_SET_FLUSH) {
1957                 /* Flush the virtual service */
1958                 ret = ip_vs_flush();
1959                 goto out_unlock;
1960         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1961                 /* Set timeout values for (tcp tcpfin udp) */
1962                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1963                 goto out_unlock;
1964         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1965                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1966                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1967                 goto out_unlock;
1968         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1969                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1970                 ret = stop_sync_thread(dm->state);
1971                 goto out_unlock;
1972         }
1973
1974         usvc = (struct ip_vs_service_user *)arg;
1975         udest = (struct ip_vs_dest_user *)(usvc + 1);
1976
1977         if (cmd == IP_VS_SO_SET_ZERO) {
1978                 /* if no service address is set, zero counters in all */
1979                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1980                         ret = ip_vs_zero_all();
1981                         goto out_unlock;
1982                 }
1983         }
1984
1985         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1986         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1987                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1988                           usvc->protocol, NIPQUAD(usvc->addr),
1989                           ntohs(usvc->port), usvc->sched_name);
1990                 ret = -EFAULT;
1991                 goto out_unlock;
1992         }
1993
1994         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1995         if (usvc->fwmark == 0)
1996                 svc = __ip_vs_service_get(usvc->protocol,
1997                                           usvc->addr, usvc->port);
1998         else
1999                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2000
2001         if (cmd != IP_VS_SO_SET_ADD
2002             && (svc == NULL || svc->protocol != usvc->protocol)) {
2003                 ret = -ESRCH;
2004                 goto out_unlock;
2005         }
2006
2007         switch (cmd) {
2008         case IP_VS_SO_SET_ADD:
2009                 if (svc != NULL)
2010                         ret = -EEXIST;
2011                 else
2012                         ret = ip_vs_add_service(usvc, &svc);
2013                 break;
2014         case IP_VS_SO_SET_EDIT:
2015                 ret = ip_vs_edit_service(svc, usvc);
2016                 break;
2017         case IP_VS_SO_SET_DEL:
2018                 ret = ip_vs_del_service(svc);
2019                 if (!ret)
2020                         goto out_unlock;
2021                 break;
2022         case IP_VS_SO_SET_ZERO:
2023                 ret = ip_vs_zero_service(svc);
2024                 break;
2025         case IP_VS_SO_SET_ADDDEST:
2026                 ret = ip_vs_add_dest(svc, udest);
2027                 break;
2028         case IP_VS_SO_SET_EDITDEST:
2029                 ret = ip_vs_edit_dest(svc, udest);
2030                 break;
2031         case IP_VS_SO_SET_DELDEST:
2032                 ret = ip_vs_del_dest(svc, udest);
2033                 break;
2034         default:
2035                 ret = -EINVAL;
2036         }
2037
2038         if (svc)
2039                 ip_vs_service_put(svc);
2040
2041   out_unlock:
2042         up(&__ip_vs_mutex);
2043   out_dec:
2044         /* decrease the module use count */
2045         ip_vs_use_count_dec();
2046
2047         return ret;
2048 }
2049
2050
2051 static void
2052 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2053 {
2054         spin_lock_bh(&src->lock);
2055         memcpy(dst, src, (char*)&src->lock - (char*)src);
2056         spin_unlock_bh(&src->lock);
2057 }
2058
2059 static void
2060 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2061 {
2062         dst->protocol = src->protocol;
2063         dst->addr = src->addr;
2064         dst->port = src->port;
2065         dst->fwmark = src->fwmark;
2066         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2067         dst->flags = src->flags;
2068         dst->timeout = src->timeout / HZ;
2069         dst->netmask = src->netmask;
2070         dst->num_dests = src->num_dests;
2071         ip_vs_copy_stats(&dst->stats, &src->stats);
2072 }
2073
2074 static inline int
2075 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2076                             struct ip_vs_get_services __user *uptr)
2077 {
2078         int idx, count=0;
2079         struct ip_vs_service *svc;
2080         struct ip_vs_service_entry entry;
2081         int ret = 0;
2082
2083         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2084                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2085                         if (count >= get->num_services)
2086                                 goto out;
2087                         memset(&entry, 0, sizeof(entry));
2088                         ip_vs_copy_service(&entry, svc);
2089                         if (copy_to_user(&uptr->entrytable[count],
2090                                          &entry, sizeof(entry))) {
2091                                 ret = -EFAULT;
2092                                 goto out;
2093                         }
2094                         count++;
2095                 }
2096         }
2097
2098         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2099                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2100                         if (count >= get->num_services)
2101                                 goto out;
2102                         memset(&entry, 0, sizeof(entry));
2103                         ip_vs_copy_service(&entry, svc);
2104                         if (copy_to_user(&uptr->entrytable[count],
2105                                          &entry, sizeof(entry))) {
2106                                 ret = -EFAULT;
2107                                 goto out;
2108                         }
2109                         count++;
2110                 }
2111         }
2112   out:
2113         return ret;
2114 }
2115
2116 static inline int
2117 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2118                          struct ip_vs_get_dests __user *uptr)
2119 {
2120         struct ip_vs_service *svc;
2121         int ret = 0;
2122
2123         if (get->fwmark)
2124                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2125         else
2126                 svc = __ip_vs_service_get(get->protocol,
2127                                           get->addr, get->port);
2128         if (svc) {
2129                 int count = 0;
2130                 struct ip_vs_dest *dest;
2131                 struct ip_vs_dest_entry entry;
2132
2133                 list_for_each_entry(dest, &svc->destinations, n_list) {
2134                         if (count >= get->num_dests)
2135                                 break;
2136
2137                         entry.addr = dest->addr;
2138                         entry.port = dest->port;
2139                         entry.conn_flags = atomic_read(&dest->conn_flags);
2140                         entry.weight = atomic_read(&dest->weight);
2141                         entry.u_threshold = dest->u_threshold;
2142                         entry.l_threshold = dest->l_threshold;
2143                         entry.activeconns = atomic_read(&dest->activeconns);
2144                         entry.inactconns = atomic_read(&dest->inactconns);
2145                         entry.persistconns = atomic_read(&dest->persistconns);
2146                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2147                         if (copy_to_user(&uptr->entrytable[count],
2148                                          &entry, sizeof(entry))) {
2149                                 ret = -EFAULT;
2150                                 break;
2151                         }
2152                         count++;
2153                 }
2154                 ip_vs_service_put(svc);
2155         } else
2156                 ret = -ESRCH;
2157         return ret;
2158 }
2159
2160 static inline void
2161 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2162 {
2163 #ifdef CONFIG_IP_VS_PROTO_TCP
2164         u->tcp_timeout =
2165                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2166         u->tcp_fin_timeout =
2167                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2168 #endif
2169 #ifdef CONFIG_IP_VS_PROTO_UDP
2170         u->udp_timeout =
2171                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2172 #endif
2173 }
2174
2175
2176 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2177 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2178 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2179 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2180 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2181 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2182 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2183
2184 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2185         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2186         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2187         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2188         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2189         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2190         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2191         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2192 };
2193
2194 static int
2195 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2196 {
2197         unsigned char arg[128];
2198         int ret = 0;
2199
2200         if (!capable(CAP_NET_ADMIN))
2201                 return -EPERM;
2202
2203         if (*len < get_arglen[GET_CMDID(cmd)]) {
2204                 IP_VS_ERR("get_ctl: len %u < %u\n",
2205                           *len, get_arglen[GET_CMDID(cmd)]);
2206                 return -EINVAL;
2207         }
2208
2209         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2210                 return -EFAULT;
2211
2212         if (down_interruptible(&__ip_vs_mutex))
2213                 return -ERESTARTSYS;
2214
2215         switch (cmd) {
2216         case IP_VS_SO_GET_VERSION:
2217         {
2218                 char buf[64];
2219
2220                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2221                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2222                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2223                         ret = -EFAULT;
2224                         goto out;
2225                 }
2226                 *len = strlen(buf)+1;
2227         }
2228         break;
2229
2230         case IP_VS_SO_GET_INFO:
2231         {
2232                 struct ip_vs_getinfo info;
2233                 info.version = IP_VS_VERSION_CODE;
2234                 info.size = IP_VS_CONN_TAB_SIZE;
2235                 info.num_services = ip_vs_num_services;
2236                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2237                         ret = -EFAULT;
2238         }
2239         break;
2240
2241         case IP_VS_SO_GET_SERVICES:
2242         {
2243                 struct ip_vs_get_services *get;
2244                 int size;
2245
2246                 get = (struct ip_vs_get_services *)arg;
2247                 size = sizeof(*get) +
2248                         sizeof(struct ip_vs_service_entry) * get->num_services;
2249                 if (*len != size) {
2250                         IP_VS_ERR("length: %u != %u\n", *len, size);
2251                         ret = -EINVAL;
2252                         goto out;
2253                 }
2254                 ret = __ip_vs_get_service_entries(get, user);
2255         }
2256         break;
2257
2258         case IP_VS_SO_GET_SERVICE:
2259         {
2260                 struct ip_vs_service_entry *entry;
2261                 struct ip_vs_service *svc;
2262
2263                 entry = (struct ip_vs_service_entry *)arg;
2264                 if (entry->fwmark)
2265                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2266                 else
2267                         svc = __ip_vs_service_get(entry->protocol,
2268                                                   entry->addr, entry->port);
2269                 if (svc) {
2270                         ip_vs_copy_service(entry, svc);
2271                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2272                                 ret = -EFAULT;
2273                         ip_vs_service_put(svc);
2274                 } else
2275                         ret = -ESRCH;
2276         }
2277         break;
2278
2279         case IP_VS_SO_GET_DESTS:
2280         {
2281                 struct ip_vs_get_dests *get;
2282                 int size;
2283
2284                 get = (struct ip_vs_get_dests *)arg;
2285                 size = sizeof(*get) +
2286                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2287                 if (*len != size) {
2288                         IP_VS_ERR("length: %u != %u\n", *len, size);
2289                         ret = -EINVAL;
2290                         goto out;
2291                 }
2292                 ret = __ip_vs_get_dest_entries(get, user);
2293         }
2294         break;
2295
2296         case IP_VS_SO_GET_TIMEOUT:
2297         {
2298                 struct ip_vs_timeout_user t;
2299
2300                 __ip_vs_get_timeouts(&t);
2301                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2302                         ret = -EFAULT;
2303         }
2304         break;
2305
2306         case IP_VS_SO_GET_DAEMON:
2307         {
2308                 struct ip_vs_daemon_user d[2];
2309
2310                 memset(&d, 0, sizeof(d));
2311                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2312                         d[0].state = IP_VS_STATE_MASTER;
2313                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2314                         d[0].syncid = ip_vs_master_syncid;
2315                 }
2316                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2317                         d[1].state = IP_VS_STATE_BACKUP;
2318                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2319                         d[1].syncid = ip_vs_backup_syncid;
2320                 }
2321                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2322                         ret = -EFAULT;
2323         }
2324         break;
2325
2326         default:
2327                 ret = -EINVAL;
2328         }
2329
2330   out:
2331         up(&__ip_vs_mutex);
2332         return ret;
2333 }
2334
2335
2336 static struct nf_sockopt_ops ip_vs_sockopts = {
2337         .pf             = PF_INET,
2338         .set_optmin     = IP_VS_BASE_CTL,
2339         .set_optmax     = IP_VS_SO_SET_MAX+1,
2340         .set            = do_ip_vs_set_ctl,
2341         .get_optmin     = IP_VS_BASE_CTL,
2342         .get_optmax     = IP_VS_SO_GET_MAX+1,
2343         .get            = do_ip_vs_get_ctl,
2344 };
2345
2346
2347 int ip_vs_control_init(void)
2348 {
2349         int ret;
2350         int idx;
2351
2352         EnterFunction(2);
2353
2354         ret = nf_register_sockopt(&ip_vs_sockopts);
2355         if (ret) {
2356                 IP_VS_ERR("cannot register sockopt.\n");
2357                 return ret;
2358         }
2359
2360         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2361         proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2362
2363         sysctl_header = register_sysctl_table(vs_root_table, 0);
2364
2365         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2366         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2367                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2368                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2369         }
2370         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2371                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2372         }
2373
2374         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2375         spin_lock_init(&ip_vs_stats.lock);
2376         ip_vs_new_estimator(&ip_vs_stats);
2377
2378         /* Hook the defense timer */
2379         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2380
2381         LeaveFunction(2);
2382         return 0;
2383 }
2384
2385
2386 void ip_vs_control_cleanup(void)
2387 {
2388         EnterFunction(2);
2389         ip_vs_trash_cleanup();
2390         cancel_rearming_delayed_work(&defense_work);
2391         ip_vs_kill_estimator(&ip_vs_stats);
2392         unregister_sysctl_table(sysctl_header);
2393         proc_net_remove("ip_vs_stats");
2394         proc_net_remove("ip_vs");
2395         nf_unregister_sockopt(&ip_vs_sockopts);
2396         LeaveFunction(2);
2397 }