/*
 *  net/dccp/ccids/ccid3.c
 *
 *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
 *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
 *  Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
 *
 *  An implementation of the DCCP protocol
 *
 *  This code has been developed by the University of Waikato WAND
 *  research group. For further information please see http://www.wand.net.nz/
 *
 *  This code also uses code from Lulea University, rereleased as GPL by its
 *  authors:
 *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
 *
 *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
 *  and to make it work as a loadable module in the DCCP stack written by
 *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
 *
 *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include "../dccp.h"
#include "ccid3.h"

#include <asm/unaligned.h>

#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static int ccid3_debug;
#define ccid3_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
#define ccid3_pr_debug(format, a...)
#endif

/*
 * Transmitter Half-Connection Routines
 */
/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */
static int do_osc_prev = true;

/*
 * Compute the initial sending rate X_init in the manner of RFC 3390:
 *
 *	X_init  =  min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT
 *
 * For consistency with other parts of the code, X_init is scaled by 2^6.
 */
static inline u64 rfc3390_initial_rate(struct sock *sk)
{
	const u32 mps = dccp_sk(sk)->dccps_mss_cache,
	       w_init = clamp(4380U, 2 * mps, 4 * mps);

	return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt);
}
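
/*
 * Illustrative example (numbers chosen for illustration, not taken from the
 * RFC text): with MPS = 1460 bytes, clamp(4380, 2920, 5840) gives w_init =
 * 4380 bytes, i.e. roughly three full-sized segments, so X_init = 4380/RTT
 * bytes/second (stored with the 2^6 scaling mentioned above).
 */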

/**
 * ccid3_update_send_interval  -  Calculate new t_ipi = s / X
 * This respects the granularity of X (64 * bytes/second) and enforces the
 * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342.
 */
static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
{
	if (unlikely(hctx->x <= hctx->s))
		hctx->x = hctx->s;
	hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x);
}
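
/*
 * Example (illustrative): with s = 1460 bytes and X = 125000 bytes/second
 * (1 Mbit/s), t_ipi = s/X is about 11680 microseconds. This assumes that
 * scaled_div32() applies the usual microsecond scaling used elsewhere in
 * the TFRC library.
 */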

static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
{
	u32 delta = ktime_us_delta(now, hctx->t_last_win_count);

	return delta / hctx->rtt;
}

/**
 * ccid3_hc_tx_update_x  -  Update allowed sending rate X
 * @stamp: most recent time if available - can be left NULL.
 * This function tracks draft rfc3448bis, check there for latest details.
 *
 * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
 *       fine-grained resolution of sending rates. This requires scaling by 2^6
 *       throughout the code. Only X_calc is unscaled (in bytes/second).
 */
static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
{
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	u64 min_rate = 2 * hctx->x_recv;
	const u64 old_x = hctx->x;
	ktime_t now = stamp ? *stamp : ktime_get_real();

	/*
	 * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
	 * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
	 * a sender is idle if it has not sent anything over a 2-RTT-period.
	 * For consistency with X and X_recv, min_rate is also scaled by 2^6.
	 */
	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
		min_rate = rfc3390_initial_rate(sk);
		min_rate = max(min_rate, 2 * hctx->x_recv);
	}

	if (hctx->p > 0) {

		hctx->x = min(((u64)hctx->x_calc) << 6, min_rate);

	} else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) {

		hctx->x = min(2 * hctx->x, min_rate);
		hctx->x = max(hctx->x,
			      scaled_div(((u64)hctx->s) << 6, hctx->rtt));
		hctx->t_ld = now;
	}

	if (hctx->x != old_x) {
		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
			       "X_recv=%u\n", (unsigned)(old_x >> 6),
			       (unsigned)(hctx->x >> 6), hctx->x_calc,
			       (unsigned)(hctx->x_recv >> 6));

		ccid3_update_send_interval(hctx);
	}
}
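
/*
 * Worked example (illustrative figures, assuming the sender was not idle):
 * with a loss event in effect (p > 0), X_calc = 50000 bytes/s and
 * X_recv = 30000 bytes/s, min_rate = 2 * X_recv = 60000 bytes/s and X
 * becomes min(50000, 60000) = 50000 bytes/s, both held internally with
 * the 2^6 scaling described above.
 */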

/*
 * ccid3_hc_tx_measure_packet_size  -  Measuring the packet size `s' (sec 4.1)
 * @new_len: DCCP payload size in bytes (not used by all methods)
 */
static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
{
#if   defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG)
	return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9);
#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX)
	return max(ccid3_hc_tx_sk(sk)->s, new_len);
#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */
	return dccp_sk(sk)->dccps_mss_cache;
#endif
}

/*
 * Update Window Counter using the algorithm from [RFC 4342, 8.1].
 * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
 */
static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
						ktime_t now)
{
	u32 delta = ktime_us_delta(now, hctx->t_last_win_count),
	    quarter_rtts = (4 * delta) / hctx->rtt;

	if (quarter_rtts > 0) {
		hctx->t_last_win_count = now;
		hctx->last_win_count  += min(quarter_rtts, 5U);
		hctx->last_win_count  &= 0xF;		/* mod 16 */
	}
}
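
/*
 * Example (illustrative): with RTT = 100 ms and 160 ms elapsed since the
 * last update, quarter_rtts = (4 * 160000) / 100000 = 6, so the window
 * counter advances by min(6, 5) = 5 and is then reduced modulo 16.
 */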

static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	unsigned long t_nfb = USEC_PER_SEC / 5;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		/* XXX: set some sensible MIB */
		goto restart_timer;
	}

	ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk,
		       hctx->feedback ? "" : "out");

	/* Ignore and do not restart after leaving the established state */
	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
		goto out;

	/* Reset feedback state to "no feedback received" */
	hctx->feedback = false;

	/*
	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
	 * RTO is 0 if and only if no feedback has been received yet.
	 */
	if (hctx->t_rto == 0 || hctx->p == 0) {

		/* halve send rate directly */
		hctx->x /= 2;
		ccid3_update_send_interval(hctx);

	} else {
		/*
		 *  Modify the cached value of X_recv
		 *
		 *  If (X_calc > 2 * X_recv)
		 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
		 *  Else
		 *    X_recv = X_calc / 4;
		 *
		 *  Note that X_recv is scaled by 2^6 while X_calc is not
		 */
		BUG_ON(hctx->p && !hctx->x_calc);

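		/*
		 * Scaling note: X_recv carries the 2^6 factor while X_calc
		 * does not, so "x_recv >> 5" is 2 * X_recv in plain
		 * bytes/second, and storing X_calc shifted left by 4 (rather
		 * than 6) amounts to X_recv = X_calc / 4 as per the pseudocode
		 * above.
		 */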
		if (hctx->x_calc > (hctx->x_recv >> 5))
			hctx->x_recv /= 2;
		else {
			hctx->x_recv = hctx->x_calc;
			hctx->x_recv <<= 4;
		}
		ccid3_hc_tx_update_x(sk, NULL);
	}
	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
		       (unsigned long long)hctx->x);

	/*
	 * Set new timeout for the nofeedback timer.
	 * See comments in packet_recv() regarding the value of t_RTO.
	 */
	if (unlikely(hctx->t_rto == 0))		/* no feedback received yet */
		t_nfb = TFRC_INITIAL_TIMEOUT;
	else
		t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);

restart_timer:
	sk_reset_timer(sk, &hctx->no_feedback_timer,
		       jiffies + usecs_to_jiffies(t_nfb));
out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/**
 * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
 * @skb: next packet candidate to send on @sk
 * This function uses the convention of ccid_packet_dequeue_eval() and
 * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
 */
static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	ktime_t now = ktime_get_real();
	s64 delay;

	/*
	 * This function is called only for Data and DataAck packets. Sending
	 * zero-sized Data(Ack)s is theoretically possible, but for congestion
	 * control this case is pathological - ignore it.
	 */
	if (unlikely(skb->len == 0))
		return -EBADMSG;

	if (hctx->s == 0) {
		sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies +
			       usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
		hctx->last_win_count   = 0;
		hctx->t_last_win_count = now;

		/* Set t_0 for initial packet */
		hctx->t_nom = now;

		/*
		 * Use initial RTT sample when available: recommended by erratum
		 * to RFC 4342. This implements the initialisation procedure of
		 * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
		 */
		if (dp->dccps_syn_rtt) {
			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
			hctx->rtt  = dp->dccps_syn_rtt;
			hctx->x    = rfc3390_initial_rate(sk);
			hctx->t_ld = now;
		} else {
			/*
			 * Sender does not have RTT sample:
			 * - set fallback RTT (RFC 4340, 3.4) since a RTT value
			 *   is needed in several parts (e.g. window counter);
			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
			 */
			hctx->rtt = DCCP_FALLBACK_RTT;
			hctx->x   = dp->dccps_mss_cache;
			hctx->x <<= 6;
		}

		/* Compute t_ipi = s / X */
		hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len);
		ccid3_update_send_interval(hctx);

		/* Seed value for Oscillation Prevention (sec. 4.5) */
		hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt);

	} else {
		delay = ktime_us_delta(hctx->t_nom, now);
		ccid3_pr_debug("delay=%ld\n", (long)delay);
		/*
		 * Scheduling of packet transmissions [RFC 3448, 4.6]
		 *
		 * if (t_now > t_nom - delta)
		 *       // send the packet now
		 * else
		 *       // send the packet in (t_nom - t_now) milliseconds.
		 */
		if (delay >= TFRC_T_DELTA)
			return (u32)delay / USEC_PER_MSEC;

		ccid3_hc_tx_update_win_count(hctx, now);
	}

	/* prepare to send now (add options etc.) */
	dp->dccps_hc_tx_insert_options = 1;
	DCCP_SKB_CB(skb)->dccpd_ccval  = hctx->last_win_count;

	/* set the nominal send time for the next following packet */
	hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi);
	return CCID_PACKET_SEND_AT_ONCE;
}

static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);

	/* Changes to s will become effective the next time X is computed */
	hctx->s = ccid3_hc_tx_measure_packet_size(sk, len);

	if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss))
		DCCP_CRIT("packet history - out of memory!");
}

static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	struct tfrc_tx_hist_entry *acked;
	ktime_t now;
	unsigned long t_nfb;
	u32 r_sample;

	/* we are only interested in ACKs */
	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
		return;
	/*
	 * Locate the acknowledged packet in the TX history.
	 *
	 * Returning "entry not found" here can for instance happen when
	 *  - the host has not sent out anything (e.g. a passive server),
	 *  - the Ack is outdated (packet with higher Ack number was received),
	 *  - it is a bogus Ack (for a packet not sent on this connection).
	 */
	acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb));
	if (acked == NULL)
		return;
	/* For the sake of RTT sampling, ignore/remove all older entries */
	tfrc_tx_hist_purge(&acked->next);

	/* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
	now	  = ktime_get_real();
	r_sample  = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
	hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);

	/*
	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
	 */
	if (!hctx->feedback) {
		hctx->feedback = true;

		if (hctx->t_rto == 0) {
			/*
			 * Initial feedback packet: Larger Initial Windows (4.2)
			 */
			hctx->x    = rfc3390_initial_rate(sk);
			hctx->t_ld = now;

			ccid3_update_send_interval(hctx);

			goto done_computing_x;
		} else if (hctx->p == 0) {
			/*
			 * First feedback after nofeedback timer expiry (4.3)
			 */
			goto done_computing_x;
		}
	}

	/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
	if (hctx->p > 0)
		hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p);
	ccid3_hc_tx_update_x(sk, &now);

done_computing_x:
	ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
		       "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
		       dccp_role(sk), sk, hctx->rtt, r_sample,
		       hctx->s, hctx->p, hctx->x_calc,
		       (unsigned)(hctx->x_recv >> 6),
		       (unsigned)(hctx->x >> 6));
	/*
	 * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to
	 * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This
	 * can be useful if few connections share a link, avoiding that buffer
	 * fill levels (RTT) oscillate as a result of frequent adjustments to X.
	 * A useful presentation with background information is in
	 *    Joerg Widmer, "Equation-Based Congestion Control",
	 *    MSc Thesis, University of Mannheim, Germany, 2000
	 * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation").
	 */
	if (do_osc_prev) {
		r_sample = tfrc_scaled_sqrt(r_sample);
		/*
		 * The modulation can work in both ways: increase/decrease t_ipi
		 * according to long-term increases/decreases of the RTT. The
		 * former is a useful measure, since it works against queue
		 * build-up. The latter temporarily increases the sending rate,
		 * so that buffers fill up more quickly. This in turn causes
		 * the RTT to increase, so that either later reduction becomes
		 * necessary or the RTT stays at a very high level. Decreasing
		 * t_ipi is therefore not supported.
		 * Furthermore, during the initial slow-start phase the RTT
		 * naturally increases, where using the algorithm would cause
		 * delays. Hence it is disabled during the initial slow-start.
		 */
		if (r_sample > hctx->r_sqmean && hctx->p > 0)
			hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample,
					      hctx->r_sqmean);
		hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI);
		/* update R_sqmean _after_ computing the modulation factor */
		hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9);
	}
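
	/*
	 * Example (illustrative): if sqrt(R_sample) is 10% above R_sqmean
	 * while a loss event is in effect, t_ipi is stretched by roughly 10%
	 * (capped at TFRC_T_MBI), spacing packets further apart so that queue
	 * build-up, and hence the RTT, stops growing.
	 */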

	/* unschedule no feedback timer */
	sk_stop_timer(sk, &hctx->no_feedback_timer);

	/*
	 * As we have calculated new ipi, delta, t_nom it is possible
	 * that we now can send a packet, so wake up dccp_wait_for_ccid
	 */
	sk->sk_write_space(sk);

	/*
	 * Update timeout interval for the nofeedback timer.
	 * We use a configuration option to increase the lower bound.
	 * This can help avoid triggering the nofeedback timer too
	 * often ('spinning') on LANs with small RTTs.
	 */
	hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO *
						 (USEC_PER_SEC / 1000)));
	/*
	 * Schedule no feedback timer to expire in
	 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
	 */
	t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);

	ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
		       "expire in %lu jiffies (%luus)\n",
		       dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);

	sk_reset_timer(sk, &hctx->no_feedback_timer,
		       jiffies + usecs_to_jiffies(t_nfb));
}

static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
				     u8 option, u8 *optval, u8 optlen)
{
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	__be32 opt_val;

	switch (option) {
	case TFRC_OPT_RECEIVE_RATE:
	case TFRC_OPT_LOSS_EVENT_RATE:
		/* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
		if (packet_type == DCCP_PKT_DATA)
			break;
		if (unlikely(optlen != 4)) {
			DCCP_WARN("%s(%p), invalid len %d for %u\n",
				  dccp_role(sk), sk, optlen, option);
			return -EINVAL;
		}
		opt_val = ntohl(get_unaligned((__be32 *)optval));

		if (option == TFRC_OPT_RECEIVE_RATE) {
			/* Receive Rate is kept in units of 64 bytes/second */
			hctx->x_recv = opt_val;
			hctx->x_recv <<= 6;

			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
				       dccp_role(sk), sk, opt_val);
		} else {
			/* Update the fixpoint Loss Event Rate fraction */
			hctx->p = tfrc_invert_loss_event_rate(opt_val);

			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
				       dccp_role(sk), sk, opt_val);
		}
	}
	return 0;
}

static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
{
	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);

	hctx->hist = NULL;
	setup_timer(&hctx->no_feedback_timer,
		    ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
	return 0;
}

static void ccid3_hc_tx_exit(struct sock *sk)
{
	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);

	sk_stop_timer(sk, &hctx->no_feedback_timer);
	tfrc_tx_hist_purge(&hctx->hist);
}

static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
	info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto;
	info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt;
}

static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
	struct tfrc_tx_info tfrc;
	const void *val;

	switch (optname) {
	case DCCP_SOCKOPT_CCID_TX_INFO:
		if (len < sizeof(tfrc))
			return -EINVAL;
		tfrc.tfrctx_x	   = hctx->x;
		tfrc.tfrctx_x_recv = hctx->x_recv;
		tfrc.tfrctx_x_calc = hctx->x_calc;
		tfrc.tfrctx_rtt	   = hctx->rtt;
		tfrc.tfrctx_p	   = hctx->p;
		tfrc.tfrctx_rto	   = hctx->t_rto;
		tfrc.tfrctx_ipi	   = hctx->t_ipi;
		len = sizeof(tfrc);
		val = &tfrc;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}
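
/*
 * Illustrative user-space usage (a sketch, not part of this file): the TX
 * statistics filled in above are typically read via getsockopt() at the DCCP
 * level, e.g.
 *
 *	struct tfrc_tx_info info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_CCID_TX_INFO, &info, &len) == 0)
 *		printf("X=%llu (64*bytes/s), RTT=%uus\n",
 *		       (unsigned long long)info.tfrctx_x, info.tfrctx_rtt);
 */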

/*
 * Receiver Half-Connection Routines
 */
static void ccid3_hc_rx_send_feedback(struct sock *sk,
				      const struct sk_buff *skb,
				      enum ccid3_fback_type fbtype)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);

	switch (fbtype) {
	case CCID3_FBACK_INITIAL:
		hcrx->x_recv	= 0;
		hcrx->p_inverse = ~0U;	/* see RFC 4342, 8.5 */
		break;
	case CCID3_FBACK_PARAM_CHANGE:
		if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
			/*
			 * rfc3448bis-06, 6.3.1: First packet(s) lost or marked
			 * FIXME: in rfc3448bis the receiver returns X_recv=0
			 * here as it normally would in the first feedback packet.
			 * However this is not possible yet, since the code still
			 * uses RFC 3448, i.e.
			 *    If (p > 0)
			 *      Calculate X_calc using the TCP throughput equation.
			 *      X = max(min(X_calc, 2*X_recv), s/t_mbi);
			 * would bring X down to s/t_mbi. That is why we return
			 * X_recv according to rfc3448bis-06 for the moment.
			 */
			u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
			    rtt = tfrc_rx_hist_rtt(&hcrx->hist);

			hcrx->x_recv = scaled_div32(s, 2 * rtt);
			break;
		}
		/*
		 * When parameters change (new loss or p > p_prev), we do not
		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
		 * always check whether at least RTT time units were covered.
		 */
		hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
		break;
	case CCID3_FBACK_PERIODIC:
		/*
		 * Step (2) of rfc3448bis-06, 6.2:
		 * - if no data packets have been received, just restart timer
		 * - if data packets have been received, re-compute X_recv
		 */
		if (hcrx->hist.bytes_recvd == 0)
			goto prepare_for_next_time;
		hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
		break;
	default:
		return;
	}

	ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse);

	dccp_sk(sk)->dccps_hc_rx_insert_options = 1;
	dccp_send_ack(sk);

prepare_for_next_time:
	tfrc_rx_hist_restart_byte_counter(&hcrx->hist);
	hcrx->last_counter = dccp_hdr(skb)->dccph_ccval;
	hcrx->feedback	   = fbtype;
}

static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
{
	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
	__be32 x_recv, pinv;

	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
		return 0;

	if (dccp_packet_without_ack(skb))
		return 0;

	x_recv = htonl(hcrx->x_recv);
	pinv   = htonl(hcrx->p_inverse);

	if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
			       &pinv, sizeof(pinv)) ||
	    dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
			       &x_recv, sizeof(x_recv)))
		return -1;

	return 0;
}

/** ccid3_first_li  -  Implements [RFC 3448, 6.3.1]
 *
 * Determine the length of the first loss interval via inverse lookup.
 * Assume that X_recv can be computed by the throughput equation
 *	             s
 *	X_recv = ----------
 *	          R * fval
 * Find some p such that f(p) = fval; return 1/p (scaled).
 */
static u32 ccid3_first_li(struct sock *sk)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
	u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
	    rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p;
	u64 fval;

	/*
	 * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p
	 * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p
	 * is about 20.64%. This yields an interval length of 4.84 (rounded up).
	 */
	if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
		return 5;

	x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
	if (x_recv == 0)
		goto failed;

	fval = scaled_div32(scaled_div(s, rtt), x_recv);
	p    = tfrc_calc_x_reverse_lookup(fval);

	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);

	if (p > 0)
		return scaled_div(1, p);
failed:
	return UINT_MAX;
}
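
/*
 * Worked example (illustrative): with s = 1460 bytes, R = 100 ms and
 * X_recv = 7300 bytes/s, the equation gives fval = s/(R * X_recv) = 2;
 * the inverse lookup then yields p of roughly 20.64%, i.e. a first loss
 * interval of about 1/p ~= 5, matching the fixed value returned above when
 * no feedback has been sent yet.
 */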

static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
	const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
	const bool is_data_packet = dccp_data_packet(skb);

	/*
	 * Perform loss detection and handle pending losses
	 */
	if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist,
				     skb, ndp, ccid3_first_li, sk))
		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE);
	/*
	 * Feedback for first non-empty data packet (RFC 3448, 6.3)
	 */
	else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet))
		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL);
	/*
	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
	 */
	else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet &&
		 SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
		ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC);
}

static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
{
	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);

	tfrc_lh_init(&hcrx->li_hist);
	return tfrc_rx_hist_init(&hcrx->hist, sk);
}

static void ccid3_hc_rx_exit(struct sock *sk)
{
	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);

	tfrc_rx_hist_purge(&hcrx->hist);
	tfrc_lh_cleanup(&hcrx->li_hist);
}

static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
	info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	info->tcpi_rcv_rtt  = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist);
}

static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
				  u32 __user *optval, int __user *optlen)
{
	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
	struct tfrc_rx_info rx_info;
	const void *val;

	switch (optname) {
	case DCCP_SOCKOPT_CCID_RX_INFO:
		if (len < sizeof(rx_info))
			return -EINVAL;
		rx_info.tfrcrx_x_recv = hcrx->x_recv;
		rx_info.tfrcrx_rtt    = tfrc_rx_hist_rtt(&hcrx->hist);
		rx_info.tfrcrx_p      = tfrc_invert_loss_event_rate(hcrx->p_inverse);
		len = sizeof(rx_info);
		val = &rx_info;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen) || copy_to_user(optval, val, len))
		return -EFAULT;

	return 0;
}

static struct ccid_operations ccid3 = {
	.ccid_id		   = DCCPC_CCID3,
	.ccid_name		   = "TCP-Friendly Rate Control",
	.ccid_owner		   = THIS_MODULE,
	.ccid_hc_tx_obj_size	   = sizeof(struct ccid3_hc_tx_sock),
	.ccid_hc_tx_init	   = ccid3_hc_tx_init,
	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
	.ccid_hc_tx_packet_sent	   = ccid3_hc_tx_packet_sent,
	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
	.ccid_hc_rx_obj_size	   = sizeof(struct ccid3_hc_rx_sock),
	.ccid_hc_rx_init	   = ccid3_hc_rx_init,
	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
	.ccid_hc_rx_packet_recv	   = ccid3_hc_rx_packet_recv,
	.ccid_hc_rx_get_info	   = ccid3_hc_rx_get_info,
	.ccid_hc_tx_get_info	   = ccid3_hc_tx_get_info,
	.ccid_hc_rx_getsockopt	   = ccid3_hc_rx_getsockopt,
	.ccid_hc_tx_getsockopt	   = ccid3_hc_tx_getsockopt,
};

module_param(do_osc_prev, bool, 0644);
MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)");
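
/*
 * Usage note (an assumption, not verified here): since the parameter is
 * created with mode 0644, it can normally be toggled at runtime via sysfs,
 * e.g.
 *	echo 0 > /sys/module/dccp_ccid3/parameters/do_osc_prev
 * assuming the CCID-3 code is built as the usual dccp_ccid3 module.
 */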

#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
module_param(ccid3_debug, bool, 0644);
MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
#endif

static __init int ccid3_module_init(void)
{
	struct timespec tp;

	/*
	 * Without a fine-grained clock resolution, RTTs/X_recv are not sampled
	 * correctly and feedback is sent either too early or too late.
	 */
	hrtimer_get_res(CLOCK_MONOTONIC, &tp);
	if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) {
		printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec"
		       " resolution - check your clocksource.\n", __func__,
		       tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION);
		return -ESOCKTNOSUPPORT;
	}
	return ccid_register(&ccid3);
}
module_init(ccid3_module_init);

static __exit void ccid3_module_exit(void)
{
	ccid_unregister(&ccid3);
}
module_exit(ccid3_module_exit);

MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, "
	      "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
MODULE_LICENSE("GPL");
MODULE_ALIAS("net-dccp-ccid-3");