source: src/linux/universal/linux-3.18/net/shortcut-fe/sfe_cm.c @ 33046

Last change on this file since 33046 was 33046, checked in by brainslayer, 10 days ago

dissent: sfe: support qos ingress shaping

File size: 31.8 KB
Line 
1/*
2 * sfe-cm.c
3 *      Shortcut forwarding engine connection manager.
4 *
5 * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved.
6 * Permission to use, copy, modify, and/or distribute this software for
7 * any purpose with or without fee is hereby granted, provided that the
8 * above copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
15 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <linux/module.h>
19#include <linux/sysfs.h>
20#include <linux/skbuff.h>
21#include <net/route.h>
22#include <net/ip6_route.h>
23#include <net/addrconf.h>
24#include <net/dsfield.h>
25#include <linux/inetdevice.h>
26#include <linux/netfilter_bridge.h>
27#include <linux/netfilter_ipv6.h>
28#include <net/netfilter/nf_conntrack_acct.h>
29#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_zones.h>
31#include <net/netfilter/nf_conntrack_core.h>
32#include <net/netfilter/nf_conntrack_timeout.h>
33#include <linux/netfilter/xt_dscp.h>
34#include <linux/if_bridge.h>
35#include <net/pkt_sched.h>
36
37#include "sfe.h"
38#include "sfe_cm.h"
39#include "sfe_backport.h"
40
41#include "sfe_ipv4.c"
42#ifdef SFE_SUPPORT_IPV6
43#include "sfe_ipv6.c"
44#endif
45#include "fast-classifier.c"
46
/*
 * Reasons why the connection manager left a packet on the slow path.
 *
 * Each value is an index into both the per-module exception counters and
 * the name table that follows.  SFE_CM_EXCEPTION_MAX is the number of
 * reasons, not a reason itself, and must stay last.
 */
typedef enum sfe_cm_exception {
        SFE_CM_EXCEPTION_PACKET_BROADCAST,      /* skb is a link-layer broadcast */
        SFE_CM_EXCEPTION_PACKET_MULTICAST,      /* skb is a link-layer multicast */
        SFE_CM_EXCEPTION_NO_IIF,                /* skb_iif does not name a device */
        SFE_CM_EXCEPTION_NO_CT,                 /* no conntrack entry on the skb */
        SFE_CM_EXCEPTION_CT_NO_TRACK,           /* conntrack entry is untracked */
        SFE_CM_EXCEPTION_CT_NO_CONFIRM,         /* conntrack entry not confirmed yet */
        SFE_CM_EXCEPTION_CT_IS_ALG,             /* connection has a conntrack helper */
        SFE_CM_EXCEPTION_IS_IPV4_MCAST,         /* IPv4 tuple has a multicast address */
        SFE_CM_EXCEPTION_IS_IPV6_MCAST,         /* IPv6 tuple has a multicast address */
        SFE_CM_EXCEPTION_TCP_NOT_ASSURED,       /* TCP connection not marked assured */
        SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED,   /* TCP state is not ESTABLISHED */
        SFE_CM_EXCEPTION_UNKNOW_PROTOCOL,       /* protocol is neither TCP nor UDP */
        SFE_CM_EXCEPTION_NO_SRC_DEV,            /* no device/MAC for source address */
        SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV,      /* no device/MAC for translated source */
        SFE_CM_EXCEPTION_NO_DEST_DEV,           /* no device/MAC for destination address */
        SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV,     /* no device/MAC for translated destination */
        SFE_CM_EXCEPTION_NO_BRIDGE,             /* bridge port without a master device */
        SFE_CM_EXCEPTION_LOCAL_OUT,             /* skb is owned by a local socket */
        SFE_CM_EXCEPTION_MAX
} sfe_cm_exception_t;

/*
 * Printable name of every exception reason, keyed by the enum value so the
 * table cannot silently drift out of step with the enum order.
 */
static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = {
        [SFE_CM_EXCEPTION_PACKET_BROADCAST]     = "PACKET_BROADCAST",
        [SFE_CM_EXCEPTION_PACKET_MULTICAST]     = "PACKET_MULTICAST",
        [SFE_CM_EXCEPTION_NO_IIF]               = "NO_IIF",
        [SFE_CM_EXCEPTION_NO_CT]                = "NO_CT",
        [SFE_CM_EXCEPTION_CT_NO_TRACK]          = "CT_NO_TRACK",
        [SFE_CM_EXCEPTION_CT_NO_CONFIRM]        = "CT_NO_CONFIRM",
        [SFE_CM_EXCEPTION_CT_IS_ALG]            = "CT_IS_ALG",
        [SFE_CM_EXCEPTION_IS_IPV4_MCAST]        = "IS_IPV4_MCAST",
        [SFE_CM_EXCEPTION_IS_IPV6_MCAST]        = "IS_IPV6_MCAST",
        [SFE_CM_EXCEPTION_TCP_NOT_ASSURED]      = "TCP_NOT_ASSURED",
        [SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED]  = "TCP_NOT_ESTABLISHED",
        [SFE_CM_EXCEPTION_UNKNOW_PROTOCOL]      = "UNKNOW_PROTOCOL",
        [SFE_CM_EXCEPTION_NO_SRC_DEV]           = "NO_SRC_DEV",
        [SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV]     = "NO_SRC_XLATE_DEV",
        [SFE_CM_EXCEPTION_NO_DEST_DEV]          = "NO_DEST_DEV",
        [SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV]    = "NO_DEST_XLATE_DEV",
        [SFE_CM_EXCEPTION_NO_BRIDGE]            = "NO_BRIDGE",
        [SFE_CM_EXCEPTION_LOCAL_OUT]            = "LOCAL_OUT",
};
89
/*
 * Per-module structure.
 *
 * Global state of the connection manager.  This file defines exactly one
 * instance of it, __sc, directly below.
 */
struct sfe_cm {
        spinlock_t lock;                /* Lock for SMP correctness */

        /*
         * Control state.
         */
        struct kobject *sys_sfe_cm;     /* sysfs linkage */

        /*
         * Callback notifiers.
         */
        struct notifier_block dev_notifier;     /* Device notifier */
        struct notifier_block inet_notifier;    /* IPv4 notifier */
#ifdef SFE_SUPPORT_IPV6
        struct notifier_block inet6_notifier;   /* IPv6 notifier */
#endif
        /*
         * Exception counters, one slot per sfe_cm_exception_t value.
         * sfe_cm_incr_exceptions() increments them while holding 'lock'.
         */
        u32 exceptions[SFE_CM_EXCEPTION_MAX];
};

/* The single module-wide instance of the connection manager state. */
static struct sfe_cm __sc;
113
114
115/*
116 * sfe_cm_incr_exceptions()
117 *      increase an exception counter.
118 */
119static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except)
120{
121        struct sfe_cm *sc = &__sc;
122
123        spin_lock_bh(&sc->lock);
124        sc->exceptions[except]++;
125        spin_unlock_bh(&sc->lock);
126}
127
128/*
129 * sfe_cm_recv()
130 *      Handle packet receives.
131 *
132 * Returns 1 if the packet is forwarded or 0 if it isn't.
133 */
134static int sfe_cm_recv(struct sk_buff *skb)
135{
136        struct net_device *dev;
137
138        /*
139         * We know that for the vast majority of packets we need the transport
140         * layer header so we may as well start to fetch it now!
141         */
142        prefetch(skb->data + 32);
143        barrier();
144
145        dev = skb->dev;
146
147#ifdef CONFIG_NET_CLS_ACT
148        /*
149         * If ingress Qdisc configured, and packet not processed by ingress Qdisc yet
150         * We cannot accelerate this packet.
151         */
152        if (dev->ingress_queue && !(skb->tc_verd & TC_NCLS)) {
153                return 0;
154        }
155#endif
156
157        /*
158         * We're only interested in IPv4 and IPv6 packets.
159         */
160        if (likely(htons(ETH_P_IP) == skb->protocol)) {
161                struct in_device *in_dev;
162
163                /*
164                 * Does our input device support IP processing?
165                 */
166                in_dev = (struct in_device *)dev->ip_ptr;
167                if (unlikely(!in_dev)) {
168                        DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
169                        return 0;
170                }
171
172                /*
173                 * Does it have an IP address?  If it doesn't then we can't do anything
174                 * interesting here!
175                 */
176                if (unlikely(!in_dev->ifa_list)) {
177                        DEBUG_TRACE("no IP address for device: %s\n", dev->name);
178                        return 0;
179                }
180
181                return sfe_ipv4_recv(dev, skb);
182        }
183
184#ifdef SFE_SUPPORT_IPV6
185        if (likely(htons(ETH_P_IPV6) == skb->protocol)) {
186                struct inet6_dev *in_dev;
187
188                /*
189                 * Does our input device support IPv6 processing?
190                 */
191                in_dev = (struct inet6_dev *)dev->ip6_ptr;
192                if (unlikely(!in_dev)) {
193                        DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name);
194                        return 0;
195                }
196
197                /*
198                 * Does it have an IPv6 address?  If it doesn't then we can't do anything
199                 * interesting here!
200                 */
201                if (unlikely(list_empty(&in_dev->addr_list))) {
202                        DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name);
203                        return 0;
204                }
205
206                return sfe_ipv6_recv(dev, skb);
207        }
208#endif
209        DEBUG_TRACE("not IP packet\n");
210        return 0;
211}
212
213/*
214 * sfe_cm_find_dev_and_mac_addr()
215 *      Find the device and MAC address for a given IPv4/IPv6 address.
216 *
217 * Returns true if we find the device and MAC address, otherwise false.
218 *
219 * We look up the rtable entry for the address and, from its neighbour
220 * structure, obtain the hardware address.  This means this function also
221 * works if the neighbours are routers too.
222 */
223static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4)
224{
225        struct neighbour *neigh;
226        struct rtable *rt;
227        struct rt6_info *rt6 = NULL;
228        struct dst_entry *dst;
229        struct net_device *mac_dev;
230
231        /*
232         * Look up the rtable entry for the IP address then get the hardware
233         * address from its neighbour structure.  This means this work when the
234         * neighbours are routers too.
235         */
236        if (likely(is_v4)) {
237                rt = ip_route_output(&init_net, addr->ip, 0, 0, 0);
238                if (unlikely(IS_ERR(rt))) {
239                        goto ret_fail;
240                }
241
242                dst = (struct dst_entry *)rt;
243        } else {
244                if (rt6_lookup)
245                    rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0);
246                if (!rt6) {
247                        goto ret_fail;
248                }
249
250                dst = (struct dst_entry *)rt6;
251        }
252
253        rcu_read_lock();
254        neigh = dst_neigh_lookup(dst, addr);
255        if (unlikely(!neigh)) {
256                rcu_read_unlock();
257                dst_release(dst);
258                goto ret_fail;
259        }
260
261        if (unlikely(!(neigh->nud_state & NUD_VALID))) {
262                rcu_read_unlock();
263                neigh_release(neigh);
264                dst_release(dst);
265                goto ret_fail;
266        }
267
268        mac_dev = neigh->dev;
269        if (!mac_dev) {
270                rcu_read_unlock();
271                neigh_release(neigh);
272                dst_release(dst);
273                goto ret_fail;
274        }
275
276        memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len);
277
278        dev_hold(mac_dev);
279        *dev = mac_dev;
280        rcu_read_unlock();
281        neigh_release(neigh);
282        dst_release(dst);
283
284        return true;
285
286ret_fail:
287        if (is_v4) {
288                DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip);
289
290        } else {
291                DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6);
292        }
293
294        return false;
295}
296
297EXPORT_SYMBOL(sfe_cm_find_dev_and_mac_addr);
298/*
299 * sfe_cm_post_routing()
300 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
301 */
302static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4)
303{
304        struct sfe_connection_create sic;
305        struct net_device *in;
306        struct nf_conn *ct;
307        enum ip_conntrack_info ctinfo;
308        struct net_device *dev;
309        struct net_device *src_dev;
310        struct net_device *dest_dev;
311        struct net_device *src_br_dev = NULL;
312        struct net_device *dest_br_dev = NULL;
313        struct nf_conntrack_tuple orig_tuple;
314        struct nf_conntrack_tuple reply_tuple;
315        SFE_NF_CONN_ACCT(acct);
316
317        /*
318         * Don't process broadcast or multicast packets.
319         */
320        if (unlikely(skb->pkt_type == PACKET_BROADCAST)) {
321                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST);
322                DEBUG_TRACE("broadcast, ignoring\n");
323                return NF_ACCEPT;
324        }
325        if (unlikely(skb->pkt_type == PACKET_MULTICAST)) {
326                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST);
327                DEBUG_TRACE("multicast, ignoring\n");
328                return NF_ACCEPT;
329        }
330
331#ifdef CONFIG_XFRM
332        /*
333         * Packet to xfrm for encapsulation, we can't process it
334         */
335        if (unlikely(skb_dst(skb)->xfrm)) {
336                DEBUG_TRACE("packet to xfrm, ignoring\n");
337                return NF_ACCEPT;
338        }
339#endif
340
341        /*
342         * Don't process locally generated packets.
343         */
344        if (skb->sk) {
345                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT);
346                DEBUG_TRACE("skip local out packet\n");
347                return NF_ACCEPT;
348        }
349
350        /*
351         * Don't process packets that are not being forwarded.
352         */
353        in = dev_get_by_index(&init_net, skb->skb_iif);
354        if (!in) {
355                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF);
356                DEBUG_TRACE("packet not forwarding\n");
357                return NF_ACCEPT;
358        }
359
360        dev_put(in);
361
362        /*
363         * Don't process packets that aren't being tracked by conntrack.
364         */
365        ct = nf_ct_get(skb, &ctinfo);
366        if (unlikely(!ct)) {
367                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT);
368                DEBUG_TRACE("no conntrack connection, ignoring\n");
369                return NF_ACCEPT;
370        }
371
372        /*
373         * Don't process untracked connections.
374         */
375        if (unlikely(nf_ct_is_untracked(ct))) {
376                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK);
377                DEBUG_TRACE("untracked connection\n");
378                return NF_ACCEPT;
379        }
380
381        /*
382         * Unconfirmed connection may be dropped by Linux at the final step,
383         * So we don't process unconfirmed connections.
384         */
385        if (!nf_ct_is_confirmed(ct)) {
386                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM);
387                DEBUG_TRACE("unconfirmed connection\n");
388                return NF_ACCEPT;
389        }
390
391        /*
392         * Don't process connections that require support from a 'helper' (typically a NAT ALG).
393         */
394        if (unlikely(nfct_help(ct))) {
395                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG);
396                DEBUG_TRACE("connection has helper\n");
397                return NF_ACCEPT;
398        }
399
400        /*
401         * Check if the acceleration of a flow could be rejected quickly.
402         */
403        acct = nf_conn_acct_find(ct);
404        if (acct) {
405                long long packets = atomic64_read(&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets);
406                if ((packets > 0xff) && (packets & 0xff)) {
407                        /*
408                         * Connection hits slow path at least 256 times, so it must be not able to accelerate.
409                         * But we also give it a chance to walk through ECM every 256 packets
410                         */
411                        return NF_ACCEPT;
412                }
413        }
414
415        /*
416         * Look up the details of our connection in conntrack.
417         *
418         * Note that the data we get from conntrack is for the "ORIGINAL" direction
419         * but our packet may actually be in the "REPLY" direction.
420         */
421        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
422        reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
423        sic.protocol = (s32)orig_tuple.dst.protonum;
424
425        sic.flags = 0;
426
427        /*
428         * Get addressing information, non-NAT first
429         */
430        if (likely(is_v4)) {
431                u32 dscp;
432
433                sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
434                sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
435
436                if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) {
437                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST);
438                        DEBUG_TRACE("multicast address\n");
439                        return NF_ACCEPT;
440                }
441
442                /*
443                 * NAT'ed addresses - note these are as seen from the 'reply' direction
444                 * When NAT does not apply to this connection these will be identical to the above.
445                 */
446                sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip;
447                sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip;
448
449                dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
450                if (dscp) {
451                        sic.dest_dscp = dscp;
452                        sic.src_dscp = sic.dest_dscp;
453                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
454                }
455        }
456#ifdef SFE_SUPPORT_IPV6
457        else {
458                u32 dscp;
459
460                sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
461                sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
462
463                if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) ||
464                    ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) {
465                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST);
466                        DEBUG_TRACE("multicast address\n");
467                        return NF_ACCEPT;
468                }
469
470                /*
471                 * NAT'ed addresses - note these are as seen from the 'reply' direction
472                 * When NAT does not apply to this connection these will be identical to the above.
473                 */
474                sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6);
475                sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6);
476
477                dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
478                if (dscp) {
479                        sic.dest_dscp = dscp;
480                        sic.src_dscp = sic.dest_dscp;
481                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
482                }
483        }
484#endif
485        switch (sic.protocol) {
486        case IPPROTO_TCP:
487                sic.src_port = orig_tuple.src.u.tcp.port;
488                sic.dest_port = orig_tuple.dst.u.tcp.port;
489                sic.src_port_xlate = reply_tuple.dst.u.tcp.port;
490                sic.dest_port_xlate = reply_tuple.src.u.tcp.port;
491                sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
492                sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
493                sic.src_td_end = ct->proto.tcp.seen[0].td_end;
494                sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
495                sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
496                sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
497                sic.dest_td_end = ct->proto.tcp.seen[1].td_end;
498                sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;
499
500                if (nf_ct_tcp_no_window_check
501                    || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
502                    || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
503                        sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK;
504                }
505
506                /*
507                 * Don't try to manage a non-established connection.
508                 */
509                if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
510                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED);
511                        DEBUG_TRACE("non-established connection\n");
512                        return NF_ACCEPT;
513                }
514
515                /*
516                 * If the connection is shutting down do not manage it.
517                 * state can not be SYN_SENT, SYN_RECV because connection is assured
518                 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
519                 */
520                spin_lock_bh(&ct->lock);
521                if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
522                        spin_unlock_bh(&ct->lock);
523                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED);
524                        DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
525                                    ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port),
526                                    &sic.dest_ip, ntohs(sic.dest_port));
527                        return NF_ACCEPT;
528                }
529                spin_unlock_bh(&ct->lock);
530                break;
531
532        case IPPROTO_UDP:
533                sic.src_port = orig_tuple.src.u.udp.port;
534                sic.dest_port = orig_tuple.dst.u.udp.port;
535                sic.src_port_xlate = reply_tuple.dst.u.udp.port;
536                sic.dest_port_xlate = reply_tuple.src.u.udp.port;
537                break;
538
539        default:
540                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL);
541                DEBUG_TRACE("unhandled protocol %d\n", sic.protocol);
542                return NF_ACCEPT;
543        }
544
545#ifdef CONFIG_XFRM
546        sic.original_accel = 1;
547        sic.reply_accel = 1;
548
549        /*
550         * For packets de-capsulated from xfrm, we still can accelerate it
551         * on the direction we just received the packet.
552         */
553        if (unlikely(skb->sp)) {
554                if (sic.protocol == IPPROTO_TCP &&
555                    !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) {
556                        return NF_ACCEPT;
557                }
558
559                if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
560                        sic.reply_accel = 0;
561                } else {
562                        sic.original_accel = 0;
563                }
564        }
565#endif
566
567        /*
568         * Get QoS information
569         */
570        if (skb->priority) {
571                sic.dest_priority = skb->priority;
572                sic.src_priority = sic.dest_priority;
573                sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY;
574        }
575
576        /*
577         * Get the net device and MAC addresses that correspond to the various source and
578         * destination host addresses.
579         */
580        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev, sic.src_mac, is_v4)) {
581                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV);
582                return NF_ACCEPT;
583        }
584
585        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) {
586                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV);
587                goto done1;
588        }
589
590        dev_put(dev);
591
592        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) {
593                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV);
594                goto done1;
595        }
596
597        dev_put(dev);
598
599        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev, sic.dest_mac_xlate, is_v4)) {
600                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV);
601                goto done1;
602        }
603
604        /*
605         * Our devices may actually be part of a bridge interface.  If that's
606         * the case then find the bridge interface instead.
607         */
608        if (src_dev->priv_flags & IFF_BRIDGE_PORT) {
609                src_br_dev = sfe_dev_get_master(src_dev);
610                if (!src_br_dev) {
611                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
612                        DEBUG_TRACE("no bridge found for: %s\n", src_dev->name);
613                        goto done2;
614                }
615
616                src_dev = src_br_dev;
617        }
618
619        if (dest_dev->priv_flags & IFF_BRIDGE_PORT) {
620                dest_br_dev = sfe_dev_get_master(dest_dev);
621                if (!dest_br_dev) {
622                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
623                        DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name);
624                        goto done3;
625                }
626
627                dest_dev = dest_br_dev;
628        }
629
630        sic.src_dev = src_dev;
631        sic.dest_dev = dest_dev;
632
633        sic.src_mtu = src_dev->mtu;
634        sic.dest_mtu = dest_dev->mtu;
635        sic.mark = skb->mark;
636        if (likely(is_v4)) {
637                sfe_ipv4_create_rule(&sic);
638        }
639#ifdef SFE_SUPPORT_IPV6
640        else {
641                sfe_ipv6_create_rule(&sic);
642        }
643#endif
644        /*
645         * If we had bridge ports then release them too.
646         */
647        if (dest_br_dev) {
648                dev_put(dest_br_dev);
649        }
650
651done3:
652        if (src_br_dev) {
653                dev_put(src_br_dev);
654        }
655
656done2:
657        dev_put(dest_dev);
658
659done1:
660        dev_put(src_dev);
661
662        return NF_ACCEPT;
663}
664
665/*
666 * sfe_cm_ipv4_post_routing_hook()
667 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
668 */
669sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
670{
671        return sfe_cm_post_routing(skb, true);
672}
673
/*
 * sfe_cm_ipv6_post_routing_hook()
 *      IPv6 entry point for the post-routing netfilter hook.
 *
 * Only built when SFE_SUPPORT_IPV6 is defined.  Passes the skb to
 * sfe_cm_post_routing() flagged as not-IPv4 and returns that function's
 * verdict unchanged.
 *
 * NOTE(review): the line below is not a plain C prototype; it is presumably
 * a macro (from sfe_backport.h) that expands to the hook signature - confirm.
 */
#ifdef SFE_SUPPORT_IPV6
sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
{
        const int is_v4 = false;

        return sfe_cm_post_routing(skb, is_v4);
}
#endif
684#ifdef CONFIG_NF_CONNTRACK_EVENTS
685/*
686 * sfe_cm_conntrack_event()
687 *      Callback event invoked when a conntrack connection's state changes.
688 */
689static int sfe_cm_conntrack_event(struct notifier_block *this,
690                                  unsigned long events, void *ptr)
691{
692        struct nf_ct_event *item = ptr;
693        struct sfe_connection_destroy sid;
694        struct nf_conn *ct = item->ct;
695        struct nf_conntrack_tuple orig_tuple;
696
697        /*
698         * If we don't have a conntrack entry then we're done.
699         */
700        if (unlikely(!ct)) {
701                DEBUG_WARN("no ct in conntrack event callback\n");
702                return NOTIFY_DONE;
703        }
704
705        /*
706         * If this is an untracked connection then we can't have any state either.
707         */
708        if (unlikely(nf_ct_is_untracked(ct))) {
709                DEBUG_TRACE("ignoring untracked conn\n");
710                return NOTIFY_DONE;
711        }
712
713        /*
714         * We're only interested in destroy events.
715         */
716        if (unlikely(!(events & (1 << IPCT_DESTROY)))) {
717                DEBUG_TRACE("ignoring non-destroy event\n");
718                return NOTIFY_DONE;
719        }
720
721        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
722        sid.protocol = (s32)orig_tuple.dst.protonum;
723
724        /*
725         * Extract information from the conntrack connection.  We're only interested
726         * in nominal connection information (i.e. we're ignoring any NAT information).
727         */
728        switch (sid.protocol) {
729        case IPPROTO_TCP:
730                sid.src_port = orig_tuple.src.u.tcp.port;
731                sid.dest_port = orig_tuple.dst.u.tcp.port;
732                break;
733
734        case IPPROTO_UDP:
735                sid.src_port = orig_tuple.src.u.udp.port;
736                sid.dest_port = orig_tuple.dst.u.udp.port;
737                break;
738
739        default:
740                DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol);
741                return NOTIFY_DONE;
742        }
743
744        if (likely(nf_ct_l3num(ct) == AF_INET)) {
745                sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
746                sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
747
748                sfe_ipv4_destroy_rule(&sid);
749        }
750#ifdef SFE_SUPPORT_IPV6
751        else if (likely(nf_ct_l3num(ct) == AF_INET6)) {
752                sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
753                sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
754
755                sfe_ipv6_destroy_rule(&sid);
756        }
757#endif
758        else {
759                DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n");
760        }
761
762        return NOTIFY_DONE;
763}
764
/*
 * Netfilter conntrack event system to monitor connection tracking changes.
 *
 * Notifier block whose callback is sfe_cm_conntrack_event().  Like that
 * function, it only exists when CONFIG_NF_CONNTRACK_EVENTS is set.
 */
static struct notifier_block sfe_cm_conntrack_notifier = {
        .notifier_call = sfe_cm_conntrack_event,
};
771#endif
772
/*
 * Structure to establish a hook into the post routing netfilter point.
 * The hook sees both locally generated packets and packets going from one
 * interface to another; sfe_cm_post_routing() ignores the locally generated
 * ones, so only forwarded traffic leads to a fast-path rule.
 *
 * Note: see include/linux/netfilter_ipv4.h for info related to priority levels.
 * We want to examine packets after NAT translation and any ALG processing.
 *
 * One entry per address family; the IPv6 entry is only present when
 * SFE_SUPPORT_IPV6 is defined.
 */
static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = {
        SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook),
#ifdef SFE_SUPPORT_IPV6
        SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook),
#endif
};
786
787/*
788 * sfe_cm_sync_rule()
789 *      Synchronize a connection's state.
790 */
791static void sfe_cm_sync_rule(struct sfe_connection_sync *sis)
792{
793        struct nf_conntrack_tuple_hash *h;
794        struct nf_conntrack_tuple tuple;
795        struct nf_conn *ct;
796        SFE_NF_CONN_ACCT(acct);
797
798        /*
799         * Create a tuple so as to be able to look up a connection
800         */
801        memset(&tuple, 0, sizeof(tuple));
802        tuple.src.u.all = (__be16)sis->src_port;
803        tuple.dst.dir = IP_CT_DIR_ORIGINAL;
804        tuple.dst.protonum = (u8)sis->protocol;
805        tuple.dst.u.all = (__be16)sis->dest_port;
806
807        if (sis->is_v6) {
808                tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6);
809                tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6);
810                tuple.src.l3num = AF_INET6;
811
812                DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n",
813                            (int)tuple.dst.protonum,
814                            &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all),
815                            &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all));
816        } else {
817                tuple.src.u3.ip = sis->src_ip.ip;
818                tuple.dst.u3.ip = sis->dest_ip.ip;
819                tuple.src.l3num = AF_INET;
820
821                DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n",
822                            (int)tuple.dst.protonum,
823                            &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all),
824                            &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all));
825        }
826
827        /*
828         * Look up conntrack connection
829         */
830        h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple);
831        if (unlikely(!h)) {
832                DEBUG_TRACE("no connection found\n");
833                return;
834        }
835
836        ct = nf_ct_tuplehash_to_ctrack(h);
837        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
838
839        /*
840         * Only update if this is not a fixed timeout
841         */
842        if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
843                spin_lock_bh(&ct->lock);
844                ct->timeout.expires += sis->delta_jiffies;
845                spin_unlock_bh(&ct->lock);
846        }
847
848        acct = nf_conn_acct_find(ct);
849        if (acct) {
850                spin_lock_bh(&ct->lock);
851                atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets);
852                atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes);
853                atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets);
854                atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes);
855                spin_unlock_bh(&ct->lock);
856        }
857
858        switch (sis->protocol) {
859        case IPPROTO_TCP:
860                spin_lock_bh(&ct->lock);
861                if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) {
862                        ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window;
863                }
864                if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) {
865                        ct->proto.tcp.seen[0].td_end = sis->src_td_end;
866                }
867                if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) {
868                        ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end;
869                }
870                if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) {
871                        ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window;
872                }
873                if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) {
874                        ct->proto.tcp.seen[1].td_end = sis->dest_td_end;
875                }
876                if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) {
877                        ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end;
878                }
879                spin_unlock_bh(&ct->lock);
880                break;
881        case IPPROTO_UDP:
882                /*
883                 * In Linux connection track, UDP flow has two timeout values:
884                 * /proc/sys/net/netfilter/nf_conntrack_udp_timeout:
885                 *      this is for uni-direction UDP flow, normally its value is 60 seconds
886                 * /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream:
887                 *      this is for bi-direction UDP flow, normally its value is 180 seconds
888                 *
889                 * Linux will update timer of UDP flow to stream timeout once it seen packets
890                 * in reply direction. But if flow is accelerated by NSS or SFE, Linux won't
891                 * see any packets. So we have to do the same thing in our stats sync message.
892                 */
893                if (!test_bit(IPS_ASSURED_BIT, &ct->status) && acct) {
894                        u_int64_t reply_pkts = atomic64_read(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets);
895
896                        if (reply_pkts != 0) {
897                                struct nf_conntrack_l4proto *l4proto;
898                                unsigned int *timeouts;
899
900                                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
901                                set_bit(IPS_ASSURED_BIT, &ct->status);
902
903                                l4proto = __nf_ct_l4proto_find((sis->is_v6 ? AF_INET6 : AF_INET), IPPROTO_UDP);
904                                timeouts = nf_ct_timeout_lookup(&init_net, ct, l4proto);
905
906                                spin_lock_bh(&ct->lock);
907                                ct->timeout.expires = jiffies + timeouts[UDP_CT_REPLIED];
908                                spin_unlock_bh(&ct->lock);
909                        }
910                }
911                break;
912        }
913
914        /*
915         * Release connection
916         */
917        nf_ct_put(ct);
918}
919
920/*
921 * sfe_cm_device_event()
922 */
923static int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr)
924{
925        struct net_device *dev = SFE_DEV_EVENT_PTR(ptr);
926
927        if (dev && (event == NETDEV_DOWN)) {
928                sfe_ipv4_destroy_all_rules_for_dev(dev);
929#ifdef SFE_SUPPORT_IPV6
930                sfe_ipv6_destroy_all_rules_for_dev(dev);
931#endif
932        }
933        return NOTIFY_DONE;
934}
935
936/*
937 * sfe_cm_inet_event()
938 */
939static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
940{
941        struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
942
943        if (dev && (event == NETDEV_DOWN)) {
944                sfe_ipv4_destroy_all_rules_for_dev(dev);
945        }
946
947        return NOTIFY_DONE;
948}
949
950/*
951 * sfe_cm_inet6_event()
952 */
953static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr)
954{
955        struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev;
956
957        if (dev && (event == NETDEV_DOWN)) {
958                sfe_ipv6_destroy_all_rules_for_dev(dev);
959        }
960
961        return NOTIFY_DONE;
962}
963
964/*
965 * sfe_cm_get_exceptions
966 *      dump exception counters
967 */
968static ssize_t sfe_cm_get_exceptions(struct device *dev,
969                                     struct device_attribute *attr,
970                                     char *buf)
971{
972        int idx, len;
973        struct sfe_cm *sc = &__sc;
974
975        spin_lock_bh(&sc->lock);
976        for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) {
977                if (sc->exceptions[idx]) {
978                        len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]);
979                }
980        }
981        spin_unlock_bh(&sc->lock);
982
983        return len;
984}
985
986/*
987 * sfe_cm_get_stop
988 *      dump stop
989 */
990static ssize_t sfe_cm_get_stop(struct device *dev,
991                               struct device_attribute *attr,
992                               char *buf)
993{
994        int (*fast_recv)(struct sk_buff *skb);
995        rcu_read_lock();
996        fast_recv = rcu_dereference(fast_nat_recv);
997        rcu_read_unlock();
998        return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", fast_recv ? 0 : 1);
999}
1000
1001static ssize_t sfe_cm_set_stop(struct device *dev,
1002                               struct device_attribute *attr,
1003                               const char *buf, size_t count)
1004{
1005        int ret;
1006        u32 num;
1007        int (*fast_recv)(struct sk_buff *skb);
1008
1009        ret = kstrtou32(buf, 0, &num);
1010        if (ret)
1011                return ret;
1012
1013        /*
1014         * Hook/Unhook the receive path in the network stack.
1015         */
1016        if (num) {
1017                RCU_INIT_POINTER(fast_nat_recv, NULL);
1018        } else {
1019                rcu_read_lock();
1020                fast_recv = rcu_dereference(fast_nat_recv);
1021                rcu_read_unlock();
1022                if (!fast_recv) {
1023                        BUG_ON(fast_nat_recv);
1024                        RCU_INIT_POINTER(fast_nat_recv, sfe_cm_recv);
1025                }
1026        }
1027
1028        DEBUG_TRACE("sfe_cm_stop = %d\n", num);
1029        return count;
1030}
1031
1032/*
1033 * sfe_cm_get_defunct_all
1034 *      dump state of SFE
1035 */
1036static ssize_t sfe_cm_get_defunct_all(struct device *dev,
1037                                      struct device_attribute *attr,
1038                                      char *buf)
1039{
1040        return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", 0);
1041}
1042
1043static ssize_t sfe_cm_set_defunct_all(struct device *dev,
1044                                      struct device_attribute *attr,
1045                                      const char *buf, size_t count)
1046{
1047        sfe_ipv4_destroy_all_rules_for_dev(NULL);
1048#ifdef SFE_SUPPORT_IPV6
1049        sfe_ipv6_destroy_all_rules_for_dev(NULL);
1050#endif
1051        return count;
1052}
1053
1054/*
1055 * sysfs attributes.
1056 */
1057static const struct device_attribute sfe_attrs[] = {
1058        __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL),
1059        __ATTR(stop, S_IWUSR | S_IRUGO, sfe_cm_get_stop, sfe_cm_set_stop),
1060        __ATTR(defunct_all, S_IWUSR | S_IRUGO, sfe_cm_get_defunct_all, sfe_cm_set_defunct_all),
1061};
1062
1063/*
1064 * sfe_cm_init()
1065 */
1066static int __init sfe_cm_init(void)
1067{
1068        struct sfe_cm *sc = &__sc;
1069        int result = -1;
1070        size_t i, j;
1071
1072#ifdef SFE_SUPPORT_IPV6
1073        sfe_ipv6_init();
1074#endif
1075        sfe_ipv4_init();
1076
1077        DEBUG_INFO("SFE CM init\n");
1078
1079        /*
1080         * Create sys/sfe_cm
1081         */
1082        sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL);
1083        if (!sc->sys_sfe_cm) {
1084                DEBUG_ERROR("failed to register sfe_cm\n");
1085                goto exit1;
1086        }
1087
1088        for (i = 0; i < ARRAY_SIZE(sfe_attrs); i++) {
1089                result = sysfs_create_file(sc->sys_sfe_cm, &sfe_attrs[i].attr);
1090                if (result) {
1091                        DEBUG_ERROR("failed to register %s : %d\n",
1092                                    sfe_attrs[i].attr.name, result);
1093                        goto exit2;
1094                }
1095        }
1096
1097        sc->dev_notifier.notifier_call = sfe_cm_device_event;
1098        sc->dev_notifier.priority = 1;
1099        register_netdevice_notifier(&sc->dev_notifier);
1100
1101        sc->inet_notifier.notifier_call = sfe_cm_inet_event;
1102        sc->inet_notifier.priority = 1;
1103        register_inetaddr_notifier(&sc->inet_notifier);
1104#ifdef SFE_SUPPORT_IPV6
1105        if (register_inet6addr_notifier) {
1106                sc->inet6_notifier.notifier_call = sfe_cm_inet6_event;
1107                sc->inet6_notifier.priority = 1;
1108                register_inet6addr_notifier(&sc->inet6_notifier);
1109        }
1110#endif
1111        /*
1112         * Register our netfilter hooks.
1113         */
1114        result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1115        if (result < 0) {
1116                DEBUG_ERROR("can't register nf post routing hook: %d\n", result);
1117                goto exit3;
1118        }
1119
1120#ifdef CONFIG_NF_CONNTRACK_EVENTS
1121        /*
1122         * Register a notifier hook to get fast notifications of expired connections.
1123         */
1124        result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier);
1125        if (result < 0) {
1126                DEBUG_ERROR("can't register nf notifier hook: %d\n", result);
1127                goto exit4;
1128        }
1129#endif
1130
1131        spin_lock_init(&sc->lock);
1132
1133        /*
1134         * Hook the shortcut sync callback.
1135         */
1136        sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule);
1137#ifdef SFE_SUPPORT_IPV6
1138        sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule);
1139#endif
1140        fast_classifier_init();
1141
1142        return 0;
1143
1144#ifdef CONFIG_NF_CONNTRACK_EVENTS
1145exit4:
1146        nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1147#endif
1148exit3:
1149#ifdef SFE_SUPPORT_IPV6
1150        if (unregister_inet6addr_notifier) {
1151                unregister_inet6addr_notifier(&sc->inet6_notifier);
1152        }
1153#endif
1154        unregister_inetaddr_notifier(&sc->inet_notifier);
1155        unregister_netdevice_notifier(&sc->dev_notifier);
1156exit2:
1157        for (j = 0; j < i; j++) {
1158                sysfs_remove_file(sc->sys_sfe_cm, &sfe_attrs[j].attr);
1159        }
1160        kobject_put(sc->sys_sfe_cm);
1161
1162exit1:
1163        sfe_ipv4_exit();
1164#ifdef SFE_SUPPORT_IPV6
1165        sfe_ipv6_exit();
1166#endif
1167
1168        return result;
1169}
1170
1171/*
1172 * sfe_cm_exit()
1173 */
1174static void __exit sfe_cm_exit(void)
1175{
1176        struct sfe_cm *sc = &__sc;
1177
1178        DEBUG_INFO("SFE CM exit\n");
1179        fast_classifier_exit();
1180
1181        /*
1182         * Unregister our sync callback.
1183         */
1184        sfe_ipv4_register_sync_rule_callback(NULL);
1185#ifdef SFE_SUPPORT_IPV6
1186        sfe_ipv6_register_sync_rule_callback(NULL);
1187#endif
1188        /*
1189         * Unregister our receive callback.
1190         */
1191        RCU_INIT_POINTER(fast_nat_recv, NULL);
1192
1193        /*
1194         * Wait for all callbacks to complete.
1195         */
1196        rcu_barrier();
1197
1198        /*
1199         * Destroy all connections.
1200         */
1201        sfe_ipv4_destroy_all_rules_for_dev(NULL);
1202#ifdef SFE_SUPPORT_IPV6
1203        sfe_ipv6_destroy_all_rules_for_dev(NULL);
1204#endif
1205#ifdef CONFIG_NF_CONNTRACK_EVENTS
1206        nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier);
1207
1208#endif
1209        nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1210
1211#ifdef SFE_SUPPORT_IPV6
1212        if (unregister_inet6addr_notifier) {
1213                unregister_inet6addr_notifier(&sc->inet6_notifier);
1214        }
1215#endif
1216        unregister_inetaddr_notifier(&sc->inet_notifier);
1217        unregister_netdevice_notifier(&sc->dev_notifier);
1218
1219        kobject_put(sc->sys_sfe_cm);
1220        sfe_ipv4_exit();
1221#ifdef SFE_SUPPORT_IPV6
1222        sfe_ipv6_exit();
1223#endif
1224}
1225
1226module_init(sfe_cm_init)
1227module_exit(sfe_cm_exit)
1228
1229MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager");
1230MODULE_LICENSE("Dual BSD/GPL");
1231
Note: See TracBrowser for help on using the repository browser.