bbs.cooldavid.org Git - net-next-2.6.git/blobdiff - net/dccp/ccids/ccid3.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
[net-next-2.6.git] / net / dccp / ccids / ccid3.c
index 95f7529864972aa34be0ba8ab3fd66b6eb640e47..3d604e1349c0b0e5d94ff755a90c7dd395b92a24 100644 (file)
@@ -54,7 +54,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
        [TFRC_SSTATE_NO_SENT]  = "NO_SENT",
        [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
        [TFRC_SSTATE_FBACK]    = "FBACK",
-       [TFRC_SSTATE_TERM]     = "TERM",
        };
 
        return ccid3_state_names[state];
@@ -91,19 +90,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
        return scaled_div(w_init << 6, hc->tx_rtt);
 }
 
-/*
- * Recalculate t_ipi and delta (should be called whenever X changes)
+/**
+ * ccid3_update_send_interval  -  Calculate new t_ipi = s / X_inst
+ * This respects the granularity of X_inst (64 * bytes/second).
  */
 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
 {
-       /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
        hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
 
-       /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
-       hc->tx_delta = min_t(u32, hc->tx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN);
-
-       ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hc->tx_t_ipi,
-                      hc->tx_delta, hc->tx_s, (unsigned)(hc->tx_x >> 6));
+       ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
+                      hc->tx_s, (unsigned)(hc->tx_x >> 6));
 }
 
 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
@@ -211,16 +207,19 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
        ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
                       ccid3_tx_state_name(hc->tx_state));
 
+       /* Ignore and do not restart after leaving the established state */
+       if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+               goto out;
+
+       /* Reset feedback state to "no feedback received" */
        if (hc->tx_state == TFRC_SSTATE_FBACK)
                ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-       else if (hc->tx_state != TFRC_SSTATE_NO_FBACK)
-               goto out;
 
        /*
         * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+        * RTO is 0 if and only if no feedback has been received yet.
         */
-       if (hc->tx_t_rto == 0 ||        /* no feedback received yet */
-           hc->tx_p == 0) {
+       if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
 
                /* halve send rate directly */
                hc->tx_x = max(hc->tx_x / 2,
@@ -256,7 +255,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
         * Set new timeout for the nofeedback timer.
         * See comments in packet_recv() regarding the value of t_RTO.
         */
-       if (unlikely(hc->tx_t_rto == 0))        /* no feedback yet */
+       if (unlikely(hc->tx_t_rto == 0))        /* no feedback received yet */
                t_nfb = TFRC_INITIAL_TIMEOUT;
        else
                t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
@@ -269,11 +268,11 @@ out:
        sock_put(sk);
 }
 
-/*
- * returns
- *   > 0: delay (in msecs) that should pass before actually sending
- *   = 0: can send immediately
- *   < 0: error condition; do not send packet
+/**
+ * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
+ * @skb: next packet candidate to send on @sk
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
  */
 static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
@@ -290,8 +289,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
        if (unlikely(skb->len == 0))
                return -EBADMSG;
 
-       switch (hc->tx_state) {
-       case TFRC_SSTATE_NO_SENT:
+       if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
                sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
                               usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
                hc->tx_last_win_count   = 0;
@@ -326,27 +324,22 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
                ccid3_update_send_interval(hc);
 
                ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-               break;
-       case TFRC_SSTATE_NO_FBACK:
-       case TFRC_SSTATE_FBACK:
+
+       } else {
                delay = ktime_us_delta(hc->tx_t_nom, now);
                ccid3_pr_debug("delay=%ld\n", (long)delay);
                /*
-                *      Scheduling of packet transmissions [RFC 3448, 4.6]
+                *      Scheduling of packet transmissions (RFC 5348, 8.3)
                 *
                 * if (t_now > t_nom - delta)
                 *       // send the packet now
                 * else
                 *       // send the packet in (t_nom - t_now) milliseconds.
                 */
-               if (delay - (s64)hc->tx_delta >= 1000)
-                       return (u32)delay / 1000L;
+               if (delay >= TFRC_T_DELTA)
+                       return (u32)delay / USEC_PER_MSEC;
 
                ccid3_hc_tx_update_win_count(hc, now);
-               break;
-       case TFRC_SSTATE_TERM:
-               DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
-               return -EINVAL;
        }
 
        /* prepare to send now (add options etc.) */
@@ -355,11 +348,10 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 
        /* set the nominal send time for the next following packet */
        hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
-       return 0;
+       return CCID_PACKET_SEND_AT_ONCE;
 }
 
-static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
-                                   unsigned int len)
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
        struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
 
@@ -372,48 +364,34 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
 static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
        struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-       struct ccid3_options_received *opt_recv;
+       struct tfrc_tx_hist_entry *acked;
        ktime_t now;
        unsigned long t_nfb;
-       u32 pinv, r_sample;
+       u32 r_sample;
 
        /* we are only interested in ACKs */
        if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
              DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
                return;
-       /* ... and only in the established state */
-       if (hc->tx_state != TFRC_SSTATE_FBACK &&
-           hc->tx_state != TFRC_SSTATE_NO_FBACK)
-               return;
-
-       opt_recv = &hc->tx_options_received;
-       now = ktime_get_real();
-
-       /* Estimate RTT from history if ACK number is valid */
-       r_sample = tfrc_tx_hist_rtt(hc->tx_hist,
-                                   DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
-       if (r_sample == 0) {
-               DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
-                         dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
-                         (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
-               return;
-       }
-
-       /* Update receive rate in units of 64 * bytes/second */
-       hc->tx_x_recv = opt_recv->ccid3or_receive_rate;
-       hc->tx_x_recv <<= 6;
-
-       /* Update loss event rate (which is scaled by 1e6) */
-       pinv = opt_recv->ccid3or_loss_event_rate;
-       if (pinv == ~0U || pinv == 0)          /* see RFC 4342, 8.5   */
-               hc->tx_p = 0;
-       else                                   /* can not exceed 100% */
-               hc->tx_p = scaled_div(1, pinv);
        /*
-        * Validate new RTT sample and update moving average
+        * Locate the acknowledged packet in the TX history.
+        *
+        * Returning "entry not found" here can for instance happen when
+        *  - the host has not sent out anything (e.g. a passive server),
+        *  - the Ack is outdated (packet with higher Ack number was received),
+        *  - it is a bogus Ack (for a packet not sent on this connection).
         */
-       r_sample = dccp_sample_rtt(sk, r_sample);
+       acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
+       if (acked == NULL)
+               return;
+       /* For the sake of RTT sampling, ignore/remove all older entries */
+       tfrc_tx_hist_purge(&acked->next);
+
+       /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+       now       = ktime_get_real();
+       r_sample  = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
        hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
+
        /*
         * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
         */
@@ -461,13 +439,12 @@ done_computing_x:
        sk->sk_write_space(sk);
 
        /*
-        * Update timeout interval for the nofeedback timer.
-        * We use a configuration option to increase the lower bound.
-        * This can help avoid triggering the nofeedback timer too
-        * often ('spinning') on LANs with small RTTs.
+        * Update timeout interval for the nofeedback timer. In order to control
+        * rate halving on networks with very low RTTs (<= 1 ms), use per-route
+        * tunable RTAX_RTO_MIN value as the lower bound.
         */
-       hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO *
-                                                      (USEC_PER_SEC / 1000)));
+       hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
+                                 USEC_PER_SEC/HZ * tcp_rto_min(sk));
        /*
         * Schedule no feedback timer to expire in
         * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
@@ -482,66 +459,41 @@ done_computing_x:
                           jiffies + usecs_to_jiffies(t_nfb));
 }
 
-static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
-                                    unsigned char len, u16 idx,
-                                    unsigned char *value)
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+                                    u8 option, u8 *optval, u8 optlen)
 {
-       int rc = 0;
-       const struct dccp_sock *dp = dccp_sk(sk);
        struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-       struct ccid3_options_received *opt_recv;
        __be32 opt_val;
 
-       opt_recv = &hc->tx_options_received;
-
-       if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
-               opt_recv->ccid3or_seqno              = dp->dccps_gsr;
-               opt_recv->ccid3or_loss_event_rate    = ~0;
-               opt_recv->ccid3or_loss_intervals_idx = 0;
-               opt_recv->ccid3or_loss_intervals_len = 0;
-               opt_recv->ccid3or_receive_rate       = 0;
-       }
-
        switch (option) {
+       case TFRC_OPT_RECEIVE_RATE:
        case TFRC_OPT_LOSS_EVENT_RATE:
-               if (unlikely(len != 4)) {
-                       DCCP_WARN("%s(%p), invalid len %d "
-                                 "for TFRC_OPT_LOSS_EVENT_RATE\n",
-                                 dccp_role(sk), sk, len);
-                       rc = -EINVAL;
-               } else {
-                       opt_val = get_unaligned((__be32 *)value);
-                       opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
-                       ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
-                                      dccp_role(sk), sk,
-                                      opt_recv->ccid3or_loss_event_rate);
+               /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+               if (packet_type == DCCP_PKT_DATA)
+                       break;
+               if (unlikely(optlen != 4)) {
+                       DCCP_WARN("%s(%p), invalid len %d for %u\n",
+                                 dccp_role(sk), sk, optlen, option);
+                       return -EINVAL;
                }
-               break;
-       case TFRC_OPT_LOSS_INTERVALS:
-               opt_recv->ccid3or_loss_intervals_idx = idx;
-               opt_recv->ccid3or_loss_intervals_len = len;
-               ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
-                              dccp_role(sk), sk,
-                              opt_recv->ccid3or_loss_intervals_idx,
-                              opt_recv->ccid3or_loss_intervals_len);
-               break;
-       case TFRC_OPT_RECEIVE_RATE:
-               if (unlikely(len != 4)) {
-                       DCCP_WARN("%s(%p), invalid len %d "
-                                 "for TFRC_OPT_RECEIVE_RATE\n",
-                                 dccp_role(sk), sk, len);
-                       rc = -EINVAL;
-               } else {
-                       opt_val = get_unaligned((__be32 *)value);
-                       opt_recv->ccid3or_receive_rate = ntohl(opt_val);
+               opt_val = ntohl(get_unaligned((__be32 *)optval));
+
+               if (option == TFRC_OPT_RECEIVE_RATE) {
+                       /* Receive Rate is kept in units of 64 bytes/second */
+                       hc->tx_x_recv = opt_val;
+                       hc->tx_x_recv <<= 6;
+
                        ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
-                                      dccp_role(sk), sk,
-                                      opt_recv->ccid3or_receive_rate);
+                                      dccp_role(sk), sk, opt_val);
+               } else {
+                       /* Update the fixpoint Loss Event Rate fraction */
+                       hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
+
+                       ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+                                      dccp_role(sk), sk, opt_val);
                }
-               break;
        }
-
-       return rc;
+       return 0;
 }
 
 static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -559,42 +511,36 @@ static void ccid3_hc_tx_exit(struct sock *sk)
 {
        struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
 
-       ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
        sk_stop_timer(sk, &hc->tx_no_feedback_timer);
-
        tfrc_tx_hist_purge(&hc->tx_hist);
 }
 
 static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
 {
-       struct ccid3_hc_tx_sock *hc;
-
-       /* Listen socks doesn't have a private CCID block */
-       if (sk->sk_state == DCCP_LISTEN)
-               return;
-
-       hc = ccid3_hc_tx_sk(sk);
-       info->tcpi_rto = hc->tx_t_rto;
-       info->tcpi_rtt = hc->tx_rtt;
+       info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
+       info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
 }
 
 static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
                                  u32 __user *optval, int __user *optlen)
 {
-       const struct ccid3_hc_tx_sock *hc;
+       const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+       struct tfrc_tx_info tfrc;
        const void *val;
 
-       /* Listen socks doesn't have a private CCID block */
-       if (sk->sk_state == DCCP_LISTEN)
-               return -EINVAL;
-
-       hc = ccid3_hc_tx_sk(sk);
        switch (optname) {
        case DCCP_SOCKOPT_CCID_TX_INFO:
-               if (len < sizeof(hc->tx_tfrc))
+               if (len < sizeof(tfrc))
                        return -EINVAL;
-               len = sizeof(hc->tx_tfrc);
-               val = &hc->tx_tfrc;
+               tfrc.tfrctx_x      = hc->tx_x;
+               tfrc.tfrctx_x_recv = hc->tx_x_recv;
+               tfrc.tfrctx_x_calc = hc->tx_x_calc;
+               tfrc.tfrctx_rtt    = hc->tx_rtt;
+               tfrc.tfrctx_p      = hc->tx_p;
+               tfrc.tfrctx_rto    = hc->tx_t_rto;
+               tfrc.tfrctx_ipi    = hc->tx_t_ipi;
+               len = sizeof(tfrc);
+               val = &tfrc;
                break;
        default:
                return -ENOPROTOOPT;
@@ -624,7 +570,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
        static const char *const ccid3_rx_state_names[] = {
        [TFRC_RSTATE_NO_DATA] = "NO_DATA",
        [TFRC_RSTATE_DATA]    = "DATA",
-       [TFRC_RSTATE_TERM]    = "TERM",
        };
 
        return ccid3_rx_state_names[state];
@@ -650,14 +595,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 {
        struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
        struct dccp_sock *dp = dccp_sk(sk);
-       ktime_t now;
+       ktime_t now = ktime_get_real();
        s64 delta = 0;
 
-       if (unlikely(hc->rx_state == TFRC_RSTATE_TERM))
-               return;
-
-       now = ktime_get_real();
-
        switch (fbtype) {
        case CCID3_FBACK_INITIAL:
                hc->rx_x_recv = 0;
@@ -701,14 +641,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 
 static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 {
-       const struct ccid3_hc_rx_sock *hc;
+       const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
        __be32 x_recv, pinv;
 
        if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
                return 0;
 
-       hc = ccid3_hc_rx_sk(sk);
-
        if (dccp_packet_without_ack(skb))
                return 0;
 
@@ -749,10 +687,11 @@ static u32 ccid3_first_li(struct sock *sk)
        x_recv = scaled_div32(hc->rx_bytes_recv, delta);
        if (x_recv == 0) {              /* would also trigger divide-by-zero */
                DCCP_WARN("X_recv==0\n");
-               if ((x_recv = hc->rx_x_recv) == 0) {
+               if (hc->rx_x_recv == 0) {
                        DCCP_BUG("stored value of X_recv is zero");
                        return ~0U;
                }
+               x_recv = hc->rx_x_recv;
        }
 
        fval = scaled_div(hc->rx_s, hc->rx_rtt);
@@ -862,46 +801,31 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 {
        struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
 
-       ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
-
        tfrc_rx_hist_purge(&hc->rx_hist);
        tfrc_lh_cleanup(&hc->rx_li_hist);
 }
 
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 {
-       const struct ccid3_hc_rx_sock *hc;
-
-       /* Listen socks doesn't have a private CCID block */
-       if (sk->sk_state == DCCP_LISTEN)
-               return;
-
-       hc = ccid3_hc_rx_sk(sk);
-       info->tcpi_ca_state = hc->rx_state;
+       info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
        info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
-       info->tcpi_rcv_rtt  = hc->rx_rtt;
+       info->tcpi_rcv_rtt  = ccid3_hc_rx_sk(sk)->rx_rtt;
 }
 
 static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
                                  u32 __user *optval, int __user *optlen)
 {
-       const struct ccid3_hc_rx_sock *hc;
+       const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
        struct tfrc_rx_info rx_info;
        const void *val;
 
-       /* Listen socks doesn't have a private CCID block */
-       if (sk->sk_state == DCCP_LISTEN)
-               return -EINVAL;
-
-       hc = ccid3_hc_rx_sk(sk);
        switch (optname) {
        case DCCP_SOCKOPT_CCID_RX_INFO:
                if (len < sizeof(rx_info))
                        return -EINVAL;
                rx_info.tfrcrx_x_recv = hc->rx_x_recv;
                rx_info.tfrcrx_rtt    = hc->rx_rtt;
-               rx_info.tfrcrx_p      = hc->rx_pinv == 0 ? ~0U :
-                                          scaled_div(1, hc->rx_pinv);
+               rx_info.tfrcrx_p      = tfrc_invert_loss_event_rate(hc->rx_pinv);
                len = sizeof(rx_info);
                val = &rx_info;
                break;