source: src/linux/universal/linux-4.9/net/shortcut-fe/sfe_cm.c @ 32718

Last change on this file since 32718 was 32718, checked in by brainslayer, 5 weeks ago

remerge code

File size: 31.5 KB
Line 
1/*
2 * sfe-cm.c
3 *      Shortcut forwarding engine connection manager.
4 *
5 * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved.
6 * Permission to use, copy, modify, and/or distribute this software for
7 * any purpose with or without fee is hereby granted, provided that the
8 * above copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
15 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <linux/module.h>
19#include <linux/sysfs.h>
20#include <linux/skbuff.h>
21#include <net/route.h>
22#include <net/ip6_route.h>
23#include <net/addrconf.h>
24#include <net/dsfield.h>
25#include <linux/inetdevice.h>
26#include <linux/netfilter_bridge.h>
27#include <linux/netfilter_ipv6.h>
28#include <net/netfilter/nf_conntrack_acct.h>
29#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_zones.h>
31#include <net/netfilter/nf_conntrack_core.h>
32#include <net/netfilter/nf_conntrack_timeout.h>
33#include <linux/netfilter/xt_dscp.h>
34#include <linux/if_bridge.h>
35
36#include "sfe.h"
37#include "sfe_cm.h"
38#include "sfe_backport.h"
39
40#include "sfe_ipv4.c"
41#ifdef SFE_SUPPORT_IPV6
42#include "sfe_ipv6.c"
43#endif
44#include "fast-classifier.c"
45
/*
 * Reasons why the post-routing hook declines to offload a packet/connection.
 * Each value indexes both the exception counter array and the table of
 * printable names, so the order here must match that table.
 */
typedef enum sfe_cm_exception {
	SFE_CM_EXCEPTION_PACKET_BROADCAST = 0,	/* skb is a link-layer broadcast */
	SFE_CM_EXCEPTION_PACKET_MULTICAST,	/* skb is a link-layer multicast */
	SFE_CM_EXCEPTION_NO_IIF,		/* no device for the skb's input ifindex */
	SFE_CM_EXCEPTION_NO_CT,			/* skb has no conntrack entry */
	SFE_CM_EXCEPTION_CT_NO_TRACK,		/* conntrack entry is untracked */
	SFE_CM_EXCEPTION_CT_NO_CONFIRM,		/* conntrack entry is not yet confirmed */
	SFE_CM_EXCEPTION_CT_IS_ALG,		/* connection has a conntrack helper */
	SFE_CM_EXCEPTION_IS_IPV4_MCAST,		/* IPv4 source or destination is multicast */
	SFE_CM_EXCEPTION_IS_IPV6_MCAST,		/* IPv6 source or destination is multicast */
	SFE_CM_EXCEPTION_TCP_NOT_ASSURED,	/* TCP connection lacks the ASSURED bit */
	SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED,	/* TCP state is not ESTABLISHED */
	SFE_CM_EXCEPTION_UNKNOW_PROTOCOL,	/* protocol is neither TCP nor UDP */
	SFE_CM_EXCEPTION_NO_SRC_DEV,		/* device/MAC lookup failed for source */
	SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV,	/* ... for translated source */
	SFE_CM_EXCEPTION_NO_DEST_DEV,		/* ... for destination */
	SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV,	/* ... for translated destination */
	SFE_CM_EXCEPTION_NO_BRIDGE,		/* bridge port without a master device */
	SFE_CM_EXCEPTION_LOCAL_OUT,		/* skb is attached to a local socket */
	SFE_CM_EXCEPTION_MAX			/* number of exception kinds */
} sfe_cm_exception_t;
67
68static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = {
69        "PACKET_BROADCAST",
70        "PACKET_MULTICAST",
71        "NO_IIF",
72        "NO_CT",
73        "CT_NO_TRACK",
74        "CT_NO_CONFIRM",
75        "CT_IS_ALG",
76        "IS_IPV4_MCAST",
77        "IS_IPV6_MCAST",
78        "TCP_NOT_ASSURED",
79        "TCP_NOT_ESTABLISHED",
80        "UNKNOW_PROTOCOL",
81        "NO_SRC_DEV",
82        "NO_SRC_XLATE_DEV",
83        "NO_DEST_DEV",
84        "NO_DEST_XLATE_DEV",
85        "NO_BRIDGE",
86        "LOCAL_OUT"
87};
88
89/*
90 * Per-module structure.
91 */
92struct sfe_cm {
93        spinlock_t lock;                /* Lock for SMP correctness */
94
95        /*
96         * Control state.
97         */
98        struct kobject *sys_sfe_cm;     /* sysfs linkage */
99
100        /*
101         * Callback notifiers.
102         */
103        struct notifier_block dev_notifier;     /* Device notifier */
104        struct notifier_block inet_notifier;    /* IPv4 notifier */
105#ifdef SFE_SUPPORT_IPV6
106        struct notifier_block inet6_notifier;   /* IPv6 notifier */
107#endif
108        u32 exceptions[SFE_CM_EXCEPTION_MAX];
109};
110
111static struct sfe_cm __sc;
112
113
114/*
115 * sfe_cm_incr_exceptions()
116 *      increase an exception counter.
117 */
118static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except)
119{
120        struct sfe_cm *sc = &__sc;
121
122        spin_lock_bh(&sc->lock);
123        sc->exceptions[except]++;
124        spin_unlock_bh(&sc->lock);
125}
126
127/*
128 * sfe_cm_recv()
129 *      Handle packet receives.
130 *
131 * Returns 1 if the packet is forwarded or 0 if it isn't.
132 */
133static int sfe_cm_recv(struct sk_buff *skb)
134{
135        struct net_device *dev;
136
137        /*
138         * We know that for the vast majority of packets we need the transport
139         * layer header so we may as well start to fetch it now!
140         */
141        prefetch(skb->data + 32);
142        barrier();
143
144        dev = skb->dev;
145
146        /*
147         * We're only interested in IPv4 and IPv6 packets.
148         */
149        if (likely(htons(ETH_P_IP) == skb->protocol)) {
150                struct in_device *in_dev;
151
152                /*
153                 * Does our input device support IP processing?
154                 */
155                in_dev = (struct in_device *)dev->ip_ptr;
156                if (unlikely(!in_dev)) {
157                        DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
158                        return 0;
159                }
160
161                /*
162                 * Does it have an IP address?  If it doesn't then we can't do anything
163                 * interesting here!
164                 */
165                if (unlikely(!in_dev->ifa_list)) {
166                        DEBUG_TRACE("no IP address for device: %s\n", dev->name);
167                        return 0;
168                }
169
170                return sfe_ipv4_recv(dev, skb);
171        }
172
173#ifdef SFE_SUPPORT_IPV6
174        if (likely(htons(ETH_P_IPV6) == skb->protocol)) {
175                struct inet6_dev *in_dev;
176
177                /*
178                 * Does our input device support IPv6 processing?
179                 */
180                in_dev = (struct inet6_dev *)dev->ip6_ptr;
181                if (unlikely(!in_dev)) {
182                        DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name);
183                        return 0;
184                }
185
186                /*
187                 * Does it have an IPv6 address?  If it doesn't then we can't do anything
188                 * interesting here!
189                 */
190                if (unlikely(list_empty(&in_dev->addr_list))) {
191                        DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name);
192                        return 0;
193                }
194
195                return sfe_ipv6_recv(dev, skb);
196        }
197#endif
198        DEBUG_TRACE("not IP packet\n");
199        return 0;
200}
201
202/*
203 * sfe_cm_find_dev_and_mac_addr()
204 *      Find the device and MAC address for a given IPv4/IPv6 address.
205 *
206 * Returns true if we find the device and MAC address, otherwise false.
207 *
208 * We look up the rtable entry for the address and, from its neighbour
209 * structure, obtain the hardware address.  This means this function also
210 * works if the neighbours are routers too.
211 */
212static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4)
213{
214        struct neighbour *neigh;
215        struct rtable *rt;
216        struct rt6_info *rt6 = NULL;
217        struct dst_entry *dst;
218        struct net_device *mac_dev;
219
220        /*
221         * Look up the rtable entry for the IP address then get the hardware
222         * address from its neighbour structure.  This means this work when the
223         * neighbours are routers too.
224         */
225        if (likely(is_v4)) {
226                rt = ip_route_output(&init_net, addr->ip, 0, 0, 0);
227                if (unlikely(IS_ERR(rt))) {
228                        goto ret_fail;
229                }
230
231                dst = (struct dst_entry *)rt;
232        } else {
233                if (rt6_lookup)
234                    rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0);
235                if (!rt6) {
236                        goto ret_fail;
237                }
238
239                dst = (struct dst_entry *)rt6;
240        }
241
242        rcu_read_lock();
243        neigh = dst_neigh_lookup(dst, addr);
244        if (unlikely(!neigh)) {
245                rcu_read_unlock();
246                dst_release(dst);
247                goto ret_fail;
248        }
249
250        if (unlikely(!(neigh->nud_state & NUD_VALID))) {
251                rcu_read_unlock();
252                neigh_release(neigh);
253                dst_release(dst);
254                goto ret_fail;
255        }
256
257        mac_dev = neigh->dev;
258        if (!mac_dev) {
259                rcu_read_unlock();
260                neigh_release(neigh);
261                dst_release(dst);
262                goto ret_fail;
263        }
264
265        memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len);
266
267        dev_hold(mac_dev);
268        *dev = mac_dev;
269        rcu_read_unlock();
270        neigh_release(neigh);
271        dst_release(dst);
272
273        return true;
274
275ret_fail:
276        if (is_v4) {
277                DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip);
278
279        } else {
280                DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6);
281        }
282
283        return false;
284}
285
286EXPORT_SYMBOL(sfe_cm_find_dev_and_mac_addr);
287/*
288 * sfe_cm_post_routing()
289 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
290 */
291static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4)
292{
293        struct sfe_connection_create sic;
294        struct net_device *in;
295        struct nf_conn *ct;
296        enum ip_conntrack_info ctinfo;
297        struct net_device *dev;
298        struct net_device *src_dev;
299        struct net_device *dest_dev;
300        struct net_device *src_br_dev = NULL;
301        struct net_device *dest_br_dev = NULL;
302        struct nf_conntrack_tuple orig_tuple;
303        struct nf_conntrack_tuple reply_tuple;
304        SFE_NF_CONN_ACCT(acct);
305
306        /*
307         * Don't process broadcast or multicast packets.
308         */
309        if (unlikely(skb->pkt_type == PACKET_BROADCAST)) {
310                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST);
311                DEBUG_TRACE("broadcast, ignoring\n");
312                return NF_ACCEPT;
313        }
314        if (unlikely(skb->pkt_type == PACKET_MULTICAST)) {
315                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST);
316                DEBUG_TRACE("multicast, ignoring\n");
317                return NF_ACCEPT;
318        }
319
320#ifdef CONFIG_XFRM
321        /*
322         * Packet to xfrm for encapsulation, we can't process it
323         */
324        if (unlikely(skb_dst(skb)->xfrm)) {
325                DEBUG_TRACE("packet to xfrm, ignoring\n");
326                return NF_ACCEPT;
327        }
328#endif
329
330        /*
331         * Don't process locally generated packets.
332         */
333        if (skb->sk) {
334                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT);
335                DEBUG_TRACE("skip local out packet\n");
336                return NF_ACCEPT;
337        }
338
339        /*
340         * Don't process packets that are not being forwarded.
341         */
342        in = dev_get_by_index(&init_net, skb->skb_iif);
343        if (!in) {
344                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF);
345                DEBUG_TRACE("packet not forwarding\n");
346                return NF_ACCEPT;
347        }
348
349        dev_put(in);
350
351        /*
352         * Don't process packets that aren't being tracked by conntrack.
353         */
354        ct = nf_ct_get(skb, &ctinfo);
355        if (unlikely(!ct)) {
356                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT);
357                DEBUG_TRACE("no conntrack connection, ignoring\n");
358                return NF_ACCEPT;
359        }
360
361        /*
362         * Don't process untracked connections.
363         */
364        if (unlikely(nf_ct_is_untracked(ct))) {
365                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK);
366                DEBUG_TRACE("untracked connection\n");
367                return NF_ACCEPT;
368        }
369
370        /*
371         * Unconfirmed connection may be dropped by Linux at the final step,
372         * So we don't process unconfirmed connections.
373         */
374        if (!nf_ct_is_confirmed(ct)) {
375                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM);
376                DEBUG_TRACE("unconfirmed connection\n");
377                return NF_ACCEPT;
378        }
379
380        /*
381         * Don't process connections that require support from a 'helper' (typically a NAT ALG).
382         */
383        if (unlikely(nfct_help(ct))) {
384                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG);
385                DEBUG_TRACE("connection has helper\n");
386                return NF_ACCEPT;
387        }
388
389        /*
390         * Check if the acceleration of a flow could be rejected quickly.
391         */
392        acct = nf_conn_acct_find(ct);
393        if (acct) {
394                long long packets = atomic64_read(&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets);
395                if ((packets > 0xff) && (packets & 0xff)) {
396                        /*
397                         * Connection hits slow path at least 256 times, so it must be not able to accelerate.
398                         * But we also give it a chance to walk through ECM every 256 packets
399                         */
400                        return NF_ACCEPT;
401                }
402        }
403
404        /*
405         * Look up the details of our connection in conntrack.
406         *
407         * Note that the data we get from conntrack is for the "ORIGINAL" direction
408         * but our packet may actually be in the "REPLY" direction.
409         */
410        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
411        reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
412        sic.protocol = (s32)orig_tuple.dst.protonum;
413
414        sic.flags = 0;
415
416        /*
417         * Get addressing information, non-NAT first
418         */
419        if (likely(is_v4)) {
420                u32 dscp;
421
422                sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
423                sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
424
425                if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) {
426                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST);
427                        DEBUG_TRACE("multicast address\n");
428                        return NF_ACCEPT;
429                }
430
431                /*
432                 * NAT'ed addresses - note these are as seen from the 'reply' direction
433                 * When NAT does not apply to this connection these will be identical to the above.
434                 */
435                sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip;
436                sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip;
437
438                dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
439                if (dscp) {
440                        sic.dest_dscp = dscp;
441                        sic.src_dscp = sic.dest_dscp;
442                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
443                }
444        }
445#ifdef SFE_SUPPORT_IPV6
446        else {
447                u32 dscp;
448
449                sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
450                sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
451
452                if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) ||
453                    ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) {
454                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST);
455                        DEBUG_TRACE("multicast address\n");
456                        return NF_ACCEPT;
457                }
458
459                /*
460                 * NAT'ed addresses - note these are as seen from the 'reply' direction
461                 * When NAT does not apply to this connection these will be identical to the above.
462                 */
463                sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6);
464                sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6);
465
466                dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
467                if (dscp) {
468                        sic.dest_dscp = dscp;
469                        sic.src_dscp = sic.dest_dscp;
470                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
471                }
472        }
473#endif
474        switch (sic.protocol) {
475        case IPPROTO_TCP:
476                sic.src_port = orig_tuple.src.u.tcp.port;
477                sic.dest_port = orig_tuple.dst.u.tcp.port;
478                sic.src_port_xlate = reply_tuple.dst.u.tcp.port;
479                sic.dest_port_xlate = reply_tuple.src.u.tcp.port;
480                sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
481                sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
482                sic.src_td_end = ct->proto.tcp.seen[0].td_end;
483                sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
484                sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
485                sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
486                sic.dest_td_end = ct->proto.tcp.seen[1].td_end;
487                sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;
488
489                if (nf_ct_tcp_no_window_check
490                    || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
491                    || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
492                        sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK;
493                }
494
495                /*
496                 * Don't try to manage a non-established connection.
497                 */
498                if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
499                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED);
500                        DEBUG_TRACE("non-established connection\n");
501                        return NF_ACCEPT;
502                }
503
504                /*
505                 * If the connection is shutting down do not manage it.
506                 * state can not be SYN_SENT, SYN_RECV because connection is assured
507                 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
508                 */
509                spin_lock_bh(&ct->lock);
510                if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
511                        spin_unlock_bh(&ct->lock);
512                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED);
513                        DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
514                                    ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port),
515                                    &sic.dest_ip, ntohs(sic.dest_port));
516                        return NF_ACCEPT;
517                }
518                spin_unlock_bh(&ct->lock);
519                break;
520
521        case IPPROTO_UDP:
522                sic.src_port = orig_tuple.src.u.udp.port;
523                sic.dest_port = orig_tuple.dst.u.udp.port;
524                sic.src_port_xlate = reply_tuple.dst.u.udp.port;
525                sic.dest_port_xlate = reply_tuple.src.u.udp.port;
526                break;
527
528        default:
529                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL);
530                DEBUG_TRACE("unhandled protocol %d\n", sic.protocol);
531                return NF_ACCEPT;
532        }
533
534#ifdef CONFIG_XFRM
535        sic.original_accel = 1;
536        sic.reply_accel = 1;
537
538        /*
539         * For packets de-capsulated from xfrm, we still can accelerate it
540         * on the direction we just received the packet.
541         */
542        if (unlikely(skb->sp)) {
543                if (sic.protocol == IPPROTO_TCP &&
544                    !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) {
545                        return NF_ACCEPT;
546                }
547
548                if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
549                        sic.reply_accel = 0;
550                } else {
551                        sic.original_accel = 0;
552                }
553        }
554#endif
555
556        /*
557         * Get QoS information
558         */
559        if (skb->priority) {
560                sic.dest_priority = skb->priority;
561                sic.src_priority = sic.dest_priority;
562                sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY;
563        }
564
565        /*
566         * Get the net device and MAC addresses that correspond to the various source and
567         * destination host addresses.
568         */
569        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev, sic.src_mac, is_v4)) {
570                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV);
571                return NF_ACCEPT;
572        }
573
574        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) {
575                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV);
576                goto done1;
577        }
578
579        dev_put(dev);
580
581        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) {
582                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV);
583                goto done1;
584        }
585
586        dev_put(dev);
587
588        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev, sic.dest_mac_xlate, is_v4)) {
589                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV);
590                goto done1;
591        }
592
593        /*
594         * Our devices may actually be part of a bridge interface.  If that's
595         * the case then find the bridge interface instead.
596         */
597        if (src_dev->priv_flags & IFF_BRIDGE_PORT) {
598                src_br_dev = sfe_dev_get_master(src_dev);
599                if (!src_br_dev) {
600                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
601                        DEBUG_TRACE("no bridge found for: %s\n", src_dev->name);
602                        goto done2;
603                }
604
605                src_dev = src_br_dev;
606        }
607
608        if (dest_dev->priv_flags & IFF_BRIDGE_PORT) {
609                dest_br_dev = sfe_dev_get_master(dest_dev);
610                if (!dest_br_dev) {
611                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
612                        DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name);
613                        goto done3;
614                }
615
616                dest_dev = dest_br_dev;
617        }
618
619        sic.src_dev = src_dev;
620        sic.dest_dev = dest_dev;
621
622        sic.src_mtu = src_dev->mtu;
623        sic.dest_mtu = dest_dev->mtu;
624        sic.mark = skb->mark;
625        if (likely(is_v4)) {
626                sfe_ipv4_create_rule(&sic);
627        }
628#ifdef SFE_SUPPORT_IPV6
629        else {
630                sfe_ipv6_create_rule(&sic);
631        }
632#endif
633        /*
634         * If we had bridge ports then release them too.
635         */
636        if (dest_br_dev) {
637                dev_put(dest_br_dev);
638        }
639
640done3:
641        if (src_br_dev) {
642                dev_put(src_br_dev);
643        }
644
645done2:
646        dev_put(dest_dev);
647
648done1:
649        dev_put(src_dev);
650
651        return NF_ACCEPT;
652}
653
654/*
655 * sfe_cm_ipv4_post_routing_hook()
656 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
657 */
658sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
659{
660        return sfe_cm_post_routing(skb, true);
661}
662
/*
 * sfe_cm_ipv6_post_routing_hook()
 *	IPv6 post-routing netfilter hook: sees packets about to leave the box,
 *	whether locally generated or forwarded from another interface.
 *
 * The signature is produced by a macro (presumably from sfe_backport.h so it
 * can follow the kernel's hook prototype - TODO confirm); only skb is used.
 * Returns the verdict from sfe_cm_post_routing().
 */
#ifdef SFE_SUPPORT_IPV6
sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
{
	const int is_v4 = false;

	return sfe_cm_post_routing(skb, is_v4);
}
#endif
673#ifdef CONFIG_NF_CONNTRACK_EVENTS
674/*
675 * sfe_cm_conntrack_event()
676 *      Callback event invoked when a conntrack connection's state changes.
677 */
678static int sfe_cm_conntrack_event(struct notifier_block *this,
679                                  unsigned long events, void *ptr)
680{
681        struct nf_ct_event *item = ptr;
682        struct sfe_connection_destroy sid;
683        struct nf_conn *ct = item->ct;
684        struct nf_conntrack_tuple orig_tuple;
685
686        /*
687         * If we don't have a conntrack entry then we're done.
688         */
689        if (unlikely(!ct)) {
690                DEBUG_WARN("no ct in conntrack event callback\n");
691                return NOTIFY_DONE;
692        }
693
694        /*
695         * If this is an untracked connection then we can't have any state either.
696         */
697        if (unlikely(nf_ct_is_untracked(ct))) {
698                DEBUG_TRACE("ignoring untracked conn\n");
699                return NOTIFY_DONE;
700        }
701
702        /*
703         * We're only interested in destroy events.
704         */
705        if (unlikely(!(events & (1 << IPCT_DESTROY)))) {
706                DEBUG_TRACE("ignoring non-destroy event\n");
707                return NOTIFY_DONE;
708        }
709
710        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
711        sid.protocol = (s32)orig_tuple.dst.protonum;
712
713        /*
714         * Extract information from the conntrack connection.  We're only interested
715         * in nominal connection information (i.e. we're ignoring any NAT information).
716         */
717        switch (sid.protocol) {
718        case IPPROTO_TCP:
719                sid.src_port = orig_tuple.src.u.tcp.port;
720                sid.dest_port = orig_tuple.dst.u.tcp.port;
721                break;
722
723        case IPPROTO_UDP:
724                sid.src_port = orig_tuple.src.u.udp.port;
725                sid.dest_port = orig_tuple.dst.u.udp.port;
726                break;
727
728        default:
729                DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol);
730                return NOTIFY_DONE;
731        }
732
733        if (likely(nf_ct_l3num(ct) == AF_INET)) {
734                sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
735                sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
736
737                sfe_ipv4_destroy_rule(&sid);
738        }
739#ifdef SFE_SUPPORT_IPV6
740        else if (likely(nf_ct_l3num(ct) == AF_INET6)) {
741                sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
742                sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
743
744                sfe_ipv6_destroy_rule(&sid);
745        }
746#endif
747        else {
748                DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n");
749        }
750
751        return NOTIFY_DONE;
752}
753
754/*
755 * Netfilter conntrack event system to monitor connection tracking changes
756 */
757static struct notifier_block sfe_cm_conntrack_notifier = {
758        .notifier_call = sfe_cm_conntrack_event,
759};
760#endif
761
762/*
763 * Structure to establish a hook into the post routing netfilter point - this
764 * will pick up local outbound and packets going from one interface to another.
765 *
766 * Note: see include/linux/netfilter_ipv4.h for info related to priority levels.
767 * We want to examine packets after NAT translation and any ALG processing.
768 */
769static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = {
770        SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook),
771#ifdef SFE_SUPPORT_IPV6
772        SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook),
773#endif
774};
775
776/*
777 * sfe_cm_sync_rule()
778 *      Synchronize a connection's state.
779 */
780static void sfe_cm_sync_rule(struct sfe_connection_sync *sis)
781{
782        struct nf_conntrack_tuple_hash *h;
783        struct nf_conntrack_tuple tuple;
784        struct nf_conn *ct;
785        SFE_NF_CONN_ACCT(acct);
786
787        /*
788         * Create a tuple so as to be able to look up a connection
789         */
790        memset(&tuple, 0, sizeof(tuple));
791        tuple.src.u.all = (__be16)sis->src_port;
792        tuple.dst.dir = IP_CT_DIR_ORIGINAL;
793        tuple.dst.protonum = (u8)sis->protocol;
794        tuple.dst.u.all = (__be16)sis->dest_port;
795
796        if (sis->is_v6) {
797                tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6);
798                tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6);
799                tuple.src.l3num = AF_INET6;
800
801                DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n",
802                            (int)tuple.dst.protonum,
803                            &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all),
804                            &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all));
805        } else {
806                tuple.src.u3.ip = sis->src_ip.ip;
807                tuple.dst.u3.ip = sis->dest_ip.ip;
808                tuple.src.l3num = AF_INET;
809
810                DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n",
811                            (int)tuple.dst.protonum,
812                            &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all),
813                            &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all));
814        }
815
816        /*
817         * Look up conntrack connection
818         */
819        h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple);
820        if (unlikely(!h)) {
821                DEBUG_TRACE("no connection found\n");
822                return;
823        }
824
825        ct = nf_ct_tuplehash_to_ctrack(h);
826        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
827
828        /*
829         * Only update if this is not a fixed timeout
830         */
831        if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
832                spin_lock_bh(&ct->lock);
833                ct->timeout += sis->delta_jiffies;
834                spin_unlock_bh(&ct->lock);
835        }
836
837        acct = nf_conn_acct_find(ct);
838        if (acct) {
839                spin_lock_bh(&ct->lock);
840                atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets);
841                atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes);
842                atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets);
843                atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes);
844                spin_unlock_bh(&ct->lock);
845        }
846
847        switch (sis->protocol) {
848        case IPPROTO_TCP:
849                spin_lock_bh(&ct->lock);
850                if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) {
851                        ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window;
852                }
853                if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) {
854                        ct->proto.tcp.seen[0].td_end = sis->src_td_end;
855                }
856                if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) {
857                        ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end;
858                }
859                if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) {
860                        ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window;
861                }
862                if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) {
863                        ct->proto.tcp.seen[1].td_end = sis->dest_td_end;
864                }
865                if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) {
866                        ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end;
867                }
868                spin_unlock_bh(&ct->lock);
869                break;
870        case IPPROTO_UDP:
871                /*
872                 * In Linux connection track, UDP flow has two timeout values:
873                 * /proc/sys/net/netfilter/nf_conntrack_udp_timeout:
874                 *      this is for uni-direction UDP flow, normally its value is 60 seconds
875                 * /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream:
876                 *      this is for bi-direction UDP flow, normally its value is 180 seconds
877                 *
878                 * Linux will update timer of UDP flow to stream timeout once it seen packets
879                 * in reply direction. But if flow is accelerated by NSS or SFE, Linux won't
880                 * see any packets. So we have to do the same thing in our stats sync message.
881                 */
882                if (!test_bit(IPS_ASSURED_BIT, &ct->status) && acct) {
883                        u_int64_t reply_pkts = atomic64_read(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets);
884
885                        if (reply_pkts != 0) {
886                                struct nf_conntrack_l4proto *l4proto;
887                                unsigned int *timeouts;
888
889                                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
890                                set_bit(IPS_ASSURED_BIT, &ct->status);
891
892                                l4proto = __nf_ct_l4proto_find((sis->is_v6 ? AF_INET6 : AF_INET), IPPROTO_UDP);
893                                timeouts = nf_ct_timeout_lookup(&init_net, ct, l4proto);
894
895                                spin_lock_bh(&ct->lock);
896                                ct->timeout = nfct_time_stamp + timeouts[UDP_CT_REPLIED];
897                                spin_unlock_bh(&ct->lock);
898                        }
899                }
900                break;
901        }
902
903        /*
904         * Release connection
905         */
906        nf_ct_put(ct);
907}
908
909/*
910 * sfe_cm_device_event()
911 */
912static int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr)
913{
914        struct net_device *dev = SFE_DEV_EVENT_PTR(ptr);
915
916        if (dev && (event == NETDEV_DOWN)) {
917                sfe_ipv4_destroy_all_rules_for_dev(dev);
918#ifdef SFE_SUPPORT_IPV6
919                sfe_ipv6_destroy_all_rules_for_dev(dev);
920#endif
921        }
922        return NOTIFY_DONE;
923}
924
925/*
926 * sfe_cm_inet_event()
927 */
928static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
929{
930        struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
931
932        if (dev && (event == NETDEV_DOWN)) {
933                sfe_ipv4_destroy_all_rules_for_dev(dev);
934        }
935
936        return NOTIFY_DONE;
937}
938
939/*
940 * sfe_cm_inet6_event()
941 */
942static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr)
943{
944        struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev;
945
946        if (dev && (event == NETDEV_DOWN)) {
947                sfe_ipv6_destroy_all_rules_for_dev(dev);
948        }
949
950        return NOTIFY_DONE;
951}
952
953/*
954 * sfe_cm_get_exceptions
955 *      dump exception counters
956 */
957static ssize_t sfe_cm_get_exceptions(struct device *dev,
958                                     struct device_attribute *attr,
959                                     char *buf)
960{
961        int idx, len;
962        struct sfe_cm *sc = &__sc;
963
964        spin_lock_bh(&sc->lock);
965        for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) {
966                if (sc->exceptions[idx]) {
967                        len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]);
968                }
969        }
970        spin_unlock_bh(&sc->lock);
971
972        return len;
973}
974
975/*
976 * sfe_cm_get_stop
977 *      dump stop
978 */
979static ssize_t sfe_cm_get_stop(struct device *dev,
980                               struct device_attribute *attr,
981                               char *buf)
982{
983        int (*fast_recv)(struct sk_buff *skb);
984        rcu_read_lock();
985        fast_recv = rcu_dereference(fast_nat_recv);
986        rcu_read_unlock();
987        return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", fast_recv ? 0 : 1);
988}
989
990static ssize_t sfe_cm_set_stop(struct device *dev,
991                               struct device_attribute *attr,
992                               const char *buf, size_t count)
993{
994        int ret;
995        u32 num;
996        int (*fast_recv)(struct sk_buff *skb);
997
998        ret = kstrtou32(buf, 0, &num);
999        if (ret)
1000                return ret;
1001
1002        /*
1003         * Hook/Unhook the receive path in the network stack.
1004         */
1005        if (num) {
1006                RCU_INIT_POINTER(fast_nat_recv, NULL);
1007        } else {
1008                rcu_read_lock();
1009                fast_recv = rcu_dereference(fast_nat_recv);
1010                rcu_read_unlock();
1011                if (!fast_recv) {
1012                        BUG_ON(fast_nat_recv);
1013                        RCU_INIT_POINTER(fast_nat_recv, sfe_cm_recv);
1014                }
1015        }
1016
1017        DEBUG_TRACE("sfe_cm_stop = %d\n", num);
1018        return count;
1019}
1020
1021/*
1022 * sfe_cm_get_defunct_all
1023 *      dump state of SFE
1024 */
1025static ssize_t sfe_cm_get_defunct_all(struct device *dev,
1026                                      struct device_attribute *attr,
1027                                      char *buf)
1028{
1029        return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", 0);
1030}
1031
1032static ssize_t sfe_cm_set_defunct_all(struct device *dev,
1033                                      struct device_attribute *attr,
1034                                      const char *buf, size_t count)
1035{
1036        sfe_ipv4_destroy_all_rules_for_dev(NULL);
1037#ifdef SFE_SUPPORT_IPV6
1038        sfe_ipv6_destroy_all_rules_for_dev(NULL);
1039#endif
1040        return count;
1041}
1042
1043/*
1044 * sysfs attributes.
1045 */
1046static const struct device_attribute sfe_attrs[] = {
1047        __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL),
1048        __ATTR(stop, S_IWUSR | S_IRUGO, sfe_cm_get_stop, sfe_cm_set_stop),
1049        __ATTR(defunct_all, S_IWUSR | S_IRUGO, sfe_cm_get_defunct_all, sfe_cm_set_defunct_all),
1050};
1051
1052/*
1053 * sfe_cm_init()
1054 */
1055static int __init sfe_cm_init(void)
1056{
1057        struct sfe_cm *sc = &__sc;
1058        int result = -1;
1059        size_t i, j;
1060
1061#ifdef SFE_SUPPORT_IPV6
1062        sfe_ipv6_init();
1063#endif
1064        sfe_ipv4_init();
1065
1066        DEBUG_INFO("SFE CM init\n");
1067
1068        /*
1069         * Create sys/sfe_cm
1070         */
1071        sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL);
1072        if (!sc->sys_sfe_cm) {
1073                DEBUG_ERROR("failed to register sfe_cm\n");
1074                goto exit1;
1075        }
1076
1077        for (i = 0; i < ARRAY_SIZE(sfe_attrs); i++) {
1078                result = sysfs_create_file(sc->sys_sfe_cm, &sfe_attrs[i].attr);
1079                if (result) {
1080                        DEBUG_ERROR("failed to register %s : %d\n",
1081                                    sfe_attrs[i].attr.name, result);
1082                        goto exit2;
1083                }
1084        }
1085
1086        sc->dev_notifier.notifier_call = sfe_cm_device_event;
1087        sc->dev_notifier.priority = 1;
1088        register_netdevice_notifier(&sc->dev_notifier);
1089
1090        sc->inet_notifier.notifier_call = sfe_cm_inet_event;
1091        sc->inet_notifier.priority = 1;
1092        register_inetaddr_notifier(&sc->inet_notifier);
1093#ifdef SFE_SUPPORT_IPV6
1094        if (register_inet6addr_notifier) {
1095                sc->inet6_notifier.notifier_call = sfe_cm_inet6_event;
1096                sc->inet6_notifier.priority = 1;
1097                register_inet6addr_notifier(&sc->inet6_notifier);
1098        }
1099#endif
1100        /*
1101         * Register our netfilter hooks.
1102         */
1103        result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1104        if (result < 0) {
1105                DEBUG_ERROR("can't register nf post routing hook: %d\n", result);
1106                goto exit3;
1107        }
1108
1109#ifdef CONFIG_NF_CONNTRACK_EVENTS
1110        /*
1111         * Register a notifier hook to get fast notifications of expired connections.
1112         */
1113        result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier);
1114        if (result < 0) {
1115                DEBUG_ERROR("can't register nf notifier hook: %d\n", result);
1116                goto exit4;
1117        }
1118#endif
1119
1120        spin_lock_init(&sc->lock);
1121
1122        /*
1123         * Hook the shortcut sync callback.
1124         */
1125        sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule);
1126#ifdef SFE_SUPPORT_IPV6
1127        sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule);
1128#endif
1129        fast_classifier_init();
1130
1131        return 0;
1132
1133#ifdef CONFIG_NF_CONNTRACK_EVENTS
1134exit4:
1135        nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1136#endif
1137exit3:
1138#ifdef SFE_SUPPORT_IPV6
1139        if (unregister_inet6addr_notifier) {
1140                unregister_inet6addr_notifier(&sc->inet6_notifier);
1141        }
1142#endif
1143        unregister_inetaddr_notifier(&sc->inet_notifier);
1144        unregister_netdevice_notifier(&sc->dev_notifier);
1145exit2:
1146        for (j = 0; j < i; j++) {
1147                sysfs_remove_file(sc->sys_sfe_cm, &sfe_attrs[j].attr);
1148        }
1149        kobject_put(sc->sys_sfe_cm);
1150
1151exit1:
1152        sfe_ipv4_exit();
1153#ifdef SFE_SUPPORT_IPV6
1154        sfe_ipv6_exit();
1155#endif
1156
1157        return result;
1158}
1159
1160/*
1161 * sfe_cm_exit()
1162 */
1163static void __exit sfe_cm_exit(void)
1164{
1165        struct sfe_cm *sc = &__sc;
1166
1167        DEBUG_INFO("SFE CM exit\n");
1168        fast_classifier_exit();
1169
1170        /*
1171         * Unregister our sync callback.
1172         */
1173        sfe_ipv4_register_sync_rule_callback(NULL);
1174#ifdef SFE_SUPPORT_IPV6
1175        sfe_ipv6_register_sync_rule_callback(NULL);
1176#endif
1177        /*
1178         * Unregister our receive callback.
1179         */
1180        RCU_INIT_POINTER(fast_nat_recv, NULL);
1181
1182        /*
1183         * Wait for all callbacks to complete.
1184         */
1185        rcu_barrier();
1186
1187        /*
1188         * Destroy all connections.
1189         */
1190        sfe_ipv4_destroy_all_rules_for_dev(NULL);
1191#ifdef SFE_SUPPORT_IPV6
1192        sfe_ipv6_destroy_all_rules_for_dev(NULL);
1193#endif
1194#ifdef CONFIG_NF_CONNTRACK_EVENTS
1195        nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier);
1196
1197#endif
1198        nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1199
1200#ifdef SFE_SUPPORT_IPV6
1201        if (unregister_inet6addr_notifier) {
1202                unregister_inet6addr_notifier(&sc->inet6_notifier);
1203        }
1204#endif
1205        unregister_inetaddr_notifier(&sc->inet_notifier);
1206        unregister_netdevice_notifier(&sc->dev_notifier);
1207
1208        kobject_put(sc->sys_sfe_cm);
1209        sfe_ipv4_exit();
1210#ifdef SFE_SUPPORT_IPV6
1211        sfe_ipv6_exit();
1212#endif
1213}
1214
/*
 * Module registration: sfe_cm_init() runs at load time and sfe_cm_exit()
 * at unload time.
 */
module_init(sfe_cm_init)
module_exit(sfe_cm_exit)

MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager");
MODULE_LICENSE("Dual BSD/GPL");
1220
Note: See TracBrowser for help on using the repository browser.