source: src/linux/universal/linux-3.2/net/shortcut-fe/sfe_cm.c @ 32719

Last change on this file since 32719 was 32719, checked in by brainslayer, 5 weeks ago

backport

File size: 30.4 KB
Line 
1/*
2 * sfe-cm.c
3 *      Shortcut forwarding engine connection manager.
4 *
5 * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved.
6 * Permission to use, copy, modify, and/or distribute this software for
7 * any purpose with or without fee is hereby granted, provided that the
8 * above copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
15 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <linux/module.h>
19#include <linux/sysfs.h>
20#include <linux/skbuff.h>
21#include <net/route.h>
22#include <net/ip6_route.h>
23#include <net/addrconf.h>
24#include <net/dsfield.h>
25#include <linux/inetdevice.h>
26#include <linux/netfilter_bridge.h>
27#include <linux/netfilter_ipv6.h>
28#include <net/netfilter/nf_conntrack_acct.h>
29#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_zones.h>
31#include <net/netfilter/nf_conntrack_core.h>
32#include <linux/netfilter/xt_dscp.h>
33#include <linux/if_bridge.h>
34
35#include "sfe.h"
36#include "sfe_cm.h"
37#include "sfe_backport.h"
38
39#include "sfe_ipv4.c"
40#ifdef SFE_SUPPORT_IPV6
41#include "sfe_ipv6.c"
42#endif
43#include "fast-classifier.c"
44
/*
 * Reasons why a packet seen by the post-routing hook was left on the slow
 * path.  The values index both the per-module exceptions[] counters and the
 * name table that follows, so the order must stay in step with that table.
 */
typedef enum sfe_cm_exception {
	SFE_CM_EXCEPTION_PACKET_BROADCAST = 0,	/* skb->pkt_type is PACKET_BROADCAST */
	SFE_CM_EXCEPTION_PACKET_MULTICAST,	/* skb->pkt_type is PACKET_MULTICAST */
	SFE_CM_EXCEPTION_NO_IIF,		/* no device found for skb->skb_iif */
	SFE_CM_EXCEPTION_NO_CT,			/* skb has no conntrack entry */
	SFE_CM_EXCEPTION_CT_NO_TRACK,		/* conntrack entry is untracked */
	SFE_CM_EXCEPTION_CT_NO_CONFIRM,		/* conntrack entry not yet confirmed */
	SFE_CM_EXCEPTION_CT_IS_ALG,		/* connection has a conntrack helper */
	SFE_CM_EXCEPTION_IS_IPV4_MCAST,		/* IPv4 source or destination is multicast */
	SFE_CM_EXCEPTION_IS_IPV6_MCAST,		/* IPv6 source or destination is multicast */
	SFE_CM_EXCEPTION_TCP_NOT_ASSURED,	/* TCP connection lacks IPS_ASSURED */
	SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED,	/* TCP state is not ESTABLISHED */
	SFE_CM_EXCEPTION_UNKNOW_PROTOCOL,	/* protocol is neither TCP nor UDP */
	SFE_CM_EXCEPTION_NO_SRC_DEV,		/* no device/MAC for the source address */
	SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV,	/* no device/MAC for the translated source */
	SFE_CM_EXCEPTION_NO_DEST_DEV,		/* no device/MAC for the destination address */
	SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV,	/* no device/MAC for the translated destination */
	SFE_CM_EXCEPTION_NO_BRIDGE,		/* bridge port without a master device */
	SFE_CM_EXCEPTION_LOCAL_OUT,		/* locally generated packet (skb->sk set) */
	SFE_CM_EXCEPTION_MAX			/* number of reasons; not a reason itself */
} sfe_cm_exception_t;
66
67static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = {
68        "PACKET_BROADCAST",
69        "PACKET_MULTICAST",
70        "NO_IIF",
71        "NO_CT",
72        "CT_NO_TRACK",
73        "CT_NO_CONFIRM",
74        "CT_IS_ALG",
75        "IS_IPV4_MCAST",
76        "IS_IPV6_MCAST",
77        "TCP_NOT_ASSURED",
78        "TCP_NOT_ESTABLISHED",
79        "UNKNOW_PROTOCOL",
80        "NO_SRC_DEV",
81        "NO_SRC_XLATE_DEV",
82        "NO_DEST_DEV",
83        "NO_DEST_XLATE_DEV",
84        "NO_BRIDGE",
85        "LOCAL_OUT"
86};
87
88/*
89 * Per-module structure.
90 */
91struct sfe_cm {
92        spinlock_t lock;                /* Lock for SMP correctness */
93
94        /*
95         * Control state.
96         */
97        struct kobject *sys_sfe_cm;     /* sysfs linkage */
98
99        /*
100         * Callback notifiers.
101         */
102        struct notifier_block dev_notifier;     /* Device notifier */
103        struct notifier_block inet_notifier;    /* IPv4 notifier */
104#ifdef SFE_SUPPORT_IPV6
105        struct notifier_block inet6_notifier;   /* IPv6 notifier */
106#endif
107        u32 exceptions[SFE_CM_EXCEPTION_MAX];
108};
109
110static struct sfe_cm __sc;
111
112
113/*
114 * sfe_cm_incr_exceptions()
115 *      increase an exception counter.
116 */
117static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except)
118{
119        struct sfe_cm *sc = &__sc;
120
121        spin_lock_bh(&sc->lock);
122        sc->exceptions[except]++;
123        spin_unlock_bh(&sc->lock);
124}
125
126/*
127 * sfe_cm_recv()
128 *      Handle packet receives.
129 *
130 * Returns 1 if the packet is forwarded or 0 if it isn't.
131 */
132static int sfe_cm_recv(struct sk_buff *skb)
133{
134        struct net_device *dev;
135
136        /*
137         * We know that for the vast majority of packets we need the transport
138         * layer header so we may as well start to fetch it now!
139         */
140        prefetch(skb->data + 32);
141        barrier();
142
143        dev = skb->dev;
144
145        /*
146         * We're only interested in IPv4 and IPv6 packets.
147         */
148        if (likely(htons(ETH_P_IP) == skb->protocol)) {
149                struct in_device *in_dev;
150
151                /*
152                 * Does our input device support IP processing?
153                 */
154                in_dev = (struct in_device *)dev->ip_ptr;
155                if (unlikely(!in_dev)) {
156                        DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
157                        return 0;
158                }
159
160                /*
161                 * Does it have an IP address?  If it doesn't then we can't do anything
162                 * interesting here!
163                 */
164                if (unlikely(!in_dev->ifa_list)) {
165                        DEBUG_TRACE("no IP address for device: %s\n", dev->name);
166                        return 0;
167                }
168
169                return sfe_ipv4_recv(dev, skb);
170        }
171
172#ifdef SFE_SUPPORT_IPV6
173        if (likely(htons(ETH_P_IPV6) == skb->protocol)) {
174                struct inet6_dev *in_dev;
175
176                /*
177                 * Does our input device support IPv6 processing?
178                 */
179                in_dev = (struct inet6_dev *)dev->ip6_ptr;
180                if (unlikely(!in_dev)) {
181                        DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name);
182                        return 0;
183                }
184
185                /*
186                 * Does it have an IPv6 address?  If it doesn't then we can't do anything
187                 * interesting here!
188                 */
189                if (unlikely(list_empty(&in_dev->addr_list))) {
190                        DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name);
191                        return 0;
192                }
193
194                return sfe_ipv6_recv(dev, skb);
195        }
196#endif
197        DEBUG_TRACE("not IP packet\n");
198        return 0;
199}
200
201/*
202 * sfe_cm_find_dev_and_mac_addr()
203 *      Find the device and MAC address for a given IPv4/IPv6 address.
204 *
205 * Returns true if we find the device and MAC address, otherwise false.
206 *
207 * We look up the rtable entry for the address and, from its neighbour
208 * structure, obtain the hardware address.  This means this function also
209 * works if the neighbours are routers too.
210 */
211static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4)
212{
213        struct neighbour *neigh;
214        struct rtable *rt;
215        struct rt6_info *rt6 = NULL;
216        struct dst_entry *dst;
217        struct net_device *mac_dev;
218
219        /*
220         * Look up the rtable entry for the IP address then get the hardware
221         * address from its neighbour structure.  This means this work when the
222         * neighbours are routers too.
223         */
224        if (likely(is_v4)) {
225                rt = ip_route_output(&init_net, addr->ip, 0, 0, 0);
226                if (unlikely(IS_ERR(rt))) {
227                        goto ret_fail;
228                }
229
230                dst = (struct dst_entry *)rt;
231        } else {
232                if (rt6_lookup)
233                    rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0);
234                if (!rt6) {
235                        goto ret_fail;
236                }
237
238                dst = (struct dst_entry *)rt6;
239        }
240
241        rcu_read_lock();
242        neigh = dst_neigh_lookup(dst, addr);
243        if (unlikely(!neigh)) {
244                rcu_read_unlock();
245                dst_release(dst);
246                goto ret_fail;
247        }
248
249        if (unlikely(!(neigh->nud_state & NUD_VALID))) {
250                rcu_read_unlock();
251                neigh_release(neigh);
252                dst_release(dst);
253                goto ret_fail;
254        }
255
256        mac_dev = neigh->dev;
257        if (!mac_dev) {
258                rcu_read_unlock();
259                neigh_release(neigh);
260                dst_release(dst);
261                goto ret_fail;
262        }
263
264        memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len);
265
266        dev_hold(mac_dev);
267        *dev = mac_dev;
268        rcu_read_unlock();
269        neigh_release(neigh);
270        dst_release(dst);
271
272        return true;
273
274ret_fail:
275        if (is_v4) {
276                DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip);
277
278        } else {
279                DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6);
280        }
281
282        return false;
283}
284
285EXPORT_SYMBOL(sfe_cm_find_dev_and_mac_addr);
286/*
287 * sfe_cm_post_routing()
288 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
289 */
290static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4)
291{
292        struct sfe_connection_create sic;
293        struct net_device *in;
294        struct nf_conn *ct;
295        enum ip_conntrack_info ctinfo;
296        struct net_device *dev;
297        struct net_device *src_dev;
298        struct net_device *dest_dev;
299        struct net_device *src_br_dev = NULL;
300        struct net_device *dest_br_dev = NULL;
301        struct nf_conntrack_tuple orig_tuple;
302        struct nf_conntrack_tuple reply_tuple;
303        SFE_NF_CONN_ACCT(acct);
304
305        /*
306         * Don't process broadcast or multicast packets.
307         */
308        if (unlikely(skb->pkt_type == PACKET_BROADCAST)) {
309                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST);
310                DEBUG_TRACE("broadcast, ignoring\n");
311                return NF_ACCEPT;
312        }
313        if (unlikely(skb->pkt_type == PACKET_MULTICAST)) {
314                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST);
315                DEBUG_TRACE("multicast, ignoring\n");
316                return NF_ACCEPT;
317        }
318
319#ifdef CONFIG_XFRM
320        /*
321         * Packet to xfrm for encapsulation, we can't process it
322         */
323        if (unlikely(skb_dst(skb)->xfrm)) {
324                DEBUG_TRACE("packet to xfrm, ignoring\n");
325                return NF_ACCEPT;
326        }
327#endif
328
329        /*
330         * Don't process locally generated packets.
331         */
332        if (skb->sk) {
333                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT);
334                DEBUG_TRACE("skip local out packet\n");
335                return NF_ACCEPT;
336        }
337
338        /*
339         * Don't process packets that are not being forwarded.
340         */
341        in = dev_get_by_index(&init_net, skb->skb_iif);
342        if (!in) {
343                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF);
344                DEBUG_TRACE("packet not forwarding\n");
345                return NF_ACCEPT;
346        }
347
348        dev_put(in);
349
350        /*
351         * Don't process packets that aren't being tracked by conntrack.
352         */
353        ct = nf_ct_get(skb, &ctinfo);
354        if (unlikely(!ct)) {
355                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT);
356                DEBUG_TRACE("no conntrack connection, ignoring\n");
357                return NF_ACCEPT;
358        }
359
360        /*
361         * Don't process untracked connections.
362         */
363        if (unlikely(nf_ct_is_untracked(ct))) {
364                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK);
365                DEBUG_TRACE("untracked connection\n");
366                return NF_ACCEPT;
367        }
368
369        /*
370         * Unconfirmed connection may be dropped by Linux at the final step,
371         * So we don't process unconfirmed connections.
372         */
373        if (!nf_ct_is_confirmed(ct)) {
374                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM);
375                DEBUG_TRACE("unconfirmed connection\n");
376                return NF_ACCEPT;
377        }
378
379        /*
380         * Don't process connections that require support from a 'helper' (typically a NAT ALG).
381         */
382        if (unlikely(nfct_help(ct))) {
383                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG);
384                DEBUG_TRACE("connection has helper\n");
385                return NF_ACCEPT;
386        }
387
388        /*
389         * Check if the acceleration of a flow could be rejected quickly.
390         */
391        acct = nf_conn_acct_find(ct);
392        if (acct) {
393                long long packets = atomic64_read((atomic64_t *)&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets);
394                if ((packets > 0xff) && (packets & 0xff)) {
395                        /*
396                         * Connection hits slow path at least 256 times, so it must be not able to accelerate.
397                         * But we also give it a chance to walk through ECM every 256 packets
398                         */
399                        return NF_ACCEPT;
400                }
401        }
402
403        /*
404         * Look up the details of our connection in conntrack.
405         *
406         * Note that the data we get from conntrack is for the "ORIGINAL" direction
407         * but our packet may actually be in the "REPLY" direction.
408         */
409        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
410        reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
411        sic.protocol = (s32)orig_tuple.dst.protonum;
412
413        sic.flags = 0;
414
415        /*
416         * Get addressing information, non-NAT first
417         */
418        if (likely(is_v4)) {
419                u32 dscp;
420
421                sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
422                sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
423
424                if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) {
425                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST);
426                        DEBUG_TRACE("multicast address\n");
427                        return NF_ACCEPT;
428                }
429
430                /*
431                 * NAT'ed addresses - note these are as seen from the 'reply' direction
432                 * When NAT does not apply to this connection these will be identical to the above.
433                 */
434                sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip;
435                sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip;
436
437                dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
438                if (dscp) {
439                        sic.dest_dscp = dscp;
440                        sic.src_dscp = sic.dest_dscp;
441                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
442                }
443        }
444#ifdef SFE_SUPPORT_IPV6
445        else {
446                u32 dscp;
447
448                sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
449                sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
450
451                if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) ||
452                    ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) {
453                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST);
454                        DEBUG_TRACE("multicast address\n");
455                        return NF_ACCEPT;
456                }
457
458                /*
459                 * NAT'ed addresses - note these are as seen from the 'reply' direction
460                 * When NAT does not apply to this connection these will be identical to the above.
461                 */
462                sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6);
463                sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6);
464
465                dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
466                if (dscp) {
467                        sic.dest_dscp = dscp;
468                        sic.src_dscp = sic.dest_dscp;
469                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
470                }
471        }
472#endif
473        switch (sic.protocol) {
474        case IPPROTO_TCP:
475                sic.src_port = orig_tuple.src.u.tcp.port;
476                sic.dest_port = orig_tuple.dst.u.tcp.port;
477                sic.src_port_xlate = reply_tuple.dst.u.tcp.port;
478                sic.dest_port_xlate = reply_tuple.src.u.tcp.port;
479                sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
480                sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
481                sic.src_td_end = ct->proto.tcp.seen[0].td_end;
482                sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
483                sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
484                sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
485                sic.dest_td_end = ct->proto.tcp.seen[1].td_end;
486                sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;
487
488                if (nf_ct_tcp_no_window_check
489                    || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
490                    || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
491                        sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK;
492                }
493
494                /*
495                 * Don't try to manage a non-established connection.
496                 */
497                if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
498                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED);
499                        DEBUG_TRACE("non-established connection\n");
500                        return NF_ACCEPT;
501                }
502
503                /*
504                 * If the connection is shutting down do not manage it.
505                 * state can not be SYN_SENT, SYN_RECV because connection is assured
506                 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
507                 */
508                spin_lock_bh(&ct->lock);
509                if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
510                        spin_unlock_bh(&ct->lock);
511                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED);
512                        DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
513                                    ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port),
514                                    &sic.dest_ip, ntohs(sic.dest_port));
515                        return NF_ACCEPT;
516                }
517                spin_unlock_bh(&ct->lock);
518                break;
519
520        case IPPROTO_UDP:
521                sic.src_port = orig_tuple.src.u.udp.port;
522                sic.dest_port = orig_tuple.dst.u.udp.port;
523                sic.src_port_xlate = reply_tuple.dst.u.udp.port;
524                sic.dest_port_xlate = reply_tuple.src.u.udp.port;
525                break;
526
527        default:
528                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL);
529                DEBUG_TRACE("unhandled protocol %d\n", sic.protocol);
530                return NF_ACCEPT;
531        }
532
533#ifdef CONFIG_XFRM
534        sic.original_accel = 1;
535        sic.reply_accel = 1;
536
537        /*
538         * For packets de-capsulated from xfrm, we still can accelerate it
539         * on the direction we just received the packet.
540         */
541        if (unlikely(skb->sp)) {
542                if (sic.protocol == IPPROTO_TCP &&
543                    !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) {
544                        return NF_ACCEPT;
545                }
546
547                if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
548                        sic.reply_accel = 0;
549                } else {
550                        sic.original_accel = 0;
551                }
552        }
553#endif
554
555        /*
556         * Get QoS information
557         */
558        if (skb->priority) {
559                sic.dest_priority = skb->priority;
560                sic.src_priority = sic.dest_priority;
561                sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY;
562        }
563
564        /*
565         * Get the net device and MAC addresses that correspond to the various source and
566         * destination host addresses.
567         */
568        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev, sic.src_mac, is_v4)) {
569                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV);
570                return NF_ACCEPT;
571        }
572
573        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) {
574                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV);
575                goto done1;
576        }
577
578        dev_put(dev);
579
580        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) {
581                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV);
582                goto done1;
583        }
584
585        dev_put(dev);
586
587        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev, sic.dest_mac_xlate, is_v4)) {
588                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV);
589                goto done1;
590        }
591
592        /*
593         * Our devices may actually be part of a bridge interface.  If that's
594         * the case then find the bridge interface instead.
595         */
596        if (src_dev->priv_flags & IFF_BRIDGE_PORT) {
597                src_br_dev = sfe_dev_get_master(src_dev);
598                if (!src_br_dev) {
599                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
600                        DEBUG_TRACE("no bridge found for: %s\n", src_dev->name);
601                        goto done2;
602                }
603
604                src_dev = src_br_dev;
605        }
606
607        if (dest_dev->priv_flags & IFF_BRIDGE_PORT) {
608                dest_br_dev = sfe_dev_get_master(dest_dev);
609                if (!dest_br_dev) {
610                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
611                        DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name);
612                        goto done3;
613                }
614
615                dest_dev = dest_br_dev;
616        }
617
618        sic.src_dev = src_dev;
619        sic.dest_dev = dest_dev;
620
621        sic.src_mtu = src_dev->mtu;
622        sic.dest_mtu = dest_dev->mtu;
623        sic.mark = skb->mark;
624        if (likely(is_v4)) {
625                sfe_ipv4_create_rule(&sic);
626        }
627#ifdef SFE_SUPPORT_IPV6
628        else {
629                sfe_ipv6_create_rule(&sic);
630        }
631#endif
632        /*
633         * If we had bridge ports then release them too.
634         */
635        if (dest_br_dev) {
636                dev_put(dest_br_dev);
637        }
638
639done3:
640        if (src_br_dev) {
641                dev_put(src_br_dev);
642        }
643
644done2:
645        dev_put(dest_dev);
646
647done1:
648        dev_put(src_dev);
649
650        return NF_ACCEPT;
651}
652
653/*
654 * sfe_cm_ipv4_post_routing_hook()
655 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
656 */
657sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
658{
659        return sfe_cm_post_routing(skb, true);
660}
661
/*
 * sfe_cm_ipv6_post_routing_hook()
 *	IPv6 post-routing hook: forwards the skb to the common handler with
 *	is_v4 set to false and returns its verdict.  Only built when
 *	SFE_SUPPORT_IPV6 is defined.
 *
 * NOTE(review): the untyped header line is presumably a macro that expands
 * to the kernel-version-specific hook signature - confirm in sfe_backport.h.
 */
#ifdef SFE_SUPPORT_IPV6
sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
{
	return sfe_cm_post_routing(skb, false);
}
#endif
672#ifdef CONFIG_NF_CONNTRACK_EVENTS
673/*
674 * sfe_cm_conntrack_event()
675 *      Callback event invoked when a conntrack connection's state changes.
676 */
677static int sfe_cm_conntrack_event(struct notifier_block *this,
678                                  unsigned long events, void *ptr)
679{
680        struct nf_ct_event *item = ptr;
681        struct sfe_connection_destroy sid;
682        struct nf_conn *ct = item->ct;
683        struct nf_conntrack_tuple orig_tuple;
684
685        /*
686         * If we don't have a conntrack entry then we're done.
687         */
688        if (unlikely(!ct)) {
689                DEBUG_WARN("no ct in conntrack event callback\n");
690                return NOTIFY_DONE;
691        }
692
693        /*
694         * If this is an untracked connection then we can't have any state either.
695         */
696        if (unlikely(nf_ct_is_untracked(ct))) {
697                DEBUG_TRACE("ignoring untracked conn\n");
698                return NOTIFY_DONE;
699        }
700
701        /*
702         * We're only interested in destroy events.
703         */
704        if (unlikely(!(events & (1 << IPCT_DESTROY)))) {
705                DEBUG_TRACE("ignoring non-destroy event\n");
706                return NOTIFY_DONE;
707        }
708
709        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
710        sid.protocol = (s32)orig_tuple.dst.protonum;
711
712        /*
713         * Extract information from the conntrack connection.  We're only interested
714         * in nominal connection information (i.e. we're ignoring any NAT information).
715         */
716        switch (sid.protocol) {
717        case IPPROTO_TCP:
718                sid.src_port = orig_tuple.src.u.tcp.port;
719                sid.dest_port = orig_tuple.dst.u.tcp.port;
720                break;
721
722        case IPPROTO_UDP:
723                sid.src_port = orig_tuple.src.u.udp.port;
724                sid.dest_port = orig_tuple.dst.u.udp.port;
725                break;
726
727        default:
728                DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol);
729                return NOTIFY_DONE;
730        }
731
732        if (likely(nf_ct_l3num(ct) == AF_INET)) {
733                sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
734                sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
735
736                sfe_ipv4_destroy_rule(&sid);
737        }
738#ifdef SFE_SUPPORT_IPV6
739        else if (likely(nf_ct_l3num(ct) == AF_INET6)) {
740                sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
741                sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
742
743                sfe_ipv6_destroy_rule(&sid);
744        }
745#endif
746        else {
747                DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n");
748        }
749
750        return NOTIFY_DONE;
751}
752
753/*
754 * Netfilter conntrack event system to monitor connection tracking changes
755 */
756static struct notifier_block sfe_cm_conntrack_notifier = {
757        .notifier_call = sfe_cm_conntrack_event,
758};
759#endif
760
761/*
762 * Structure to establish a hook into the post routing netfilter point - this
763 * will pick up local outbound and packets going from one interface to another.
764 *
765 * Note: see include/linux/netfilter_ipv4.h for info related to priority levels.
766 * We want to examine packets after NAT translation and any ALG processing.
767 */
768static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = {
769        SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook),
770#ifdef SFE_SUPPORT_IPV6
771        SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook),
772#endif
773};
774
775/*
776 * sfe_cm_sync_rule()
777 *      Synchronize a connection's state.
778 */
779static void sfe_cm_sync_rule(struct sfe_connection_sync *sis)
780{
781        struct nf_conntrack_tuple_hash *h;
782        struct nf_conntrack_tuple tuple;
783        struct nf_conn *ct;
784        SFE_NF_CONN_ACCT(acct);
785
786        /*
787         * Create a tuple so as to be able to look up a connection
788         */
789        memset(&tuple, 0, sizeof(tuple));
790        tuple.src.u.all = (__be16)sis->src_port;
791        tuple.dst.dir = IP_CT_DIR_ORIGINAL;
792        tuple.dst.protonum = (u8)sis->protocol;
793        tuple.dst.u.all = (__be16)sis->dest_port;
794
795        if (sis->is_v6) {
796                tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6);
797                tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6);
798                tuple.src.l3num = AF_INET6;
799
800                DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n",
801                            (int)tuple.dst.protonum,
802                            &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all),
803                            &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all));
804        } else {
805                tuple.src.u3.ip = sis->src_ip.ip;
806                tuple.dst.u3.ip = sis->dest_ip.ip;
807                tuple.src.l3num = AF_INET;
808
809                DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n",
810                            (int)tuple.dst.protonum,
811                            &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all),
812                            &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all));
813        }
814
815        /*
816         * Look up conntrack connection
817         */
818        h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple);
819        if (unlikely(!h)) {
820                DEBUG_TRACE("no connection found\n");
821                return;
822        }
823
824        ct = nf_ct_tuplehash_to_ctrack(h);
825        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
826
827        /*
828         * Only update if this is not a fixed timeout
829         */
830        if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
831                spin_lock_bh(&ct->lock);
832                ct->timeout.expires += sis->delta_jiffies;
833                spin_unlock_bh(&ct->lock);
834        }
835
836        acct = nf_conn_acct_find(ct);
837        if (acct) {
838                spin_lock_bh(&ct->lock);
839                atomic64_add(sis->src_new_packet_count, (atomic64_t *)&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets);
840                atomic64_add(sis->src_new_byte_count, (atomic64_t *)&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes);
841                atomic64_add(sis->dest_new_packet_count, (atomic64_t *)&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets);
842                atomic64_add(sis->dest_new_byte_count, (atomic64_t *)&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes);
843                spin_unlock_bh(&ct->lock);
844        }
845
846        switch (sis->protocol) {
847        case IPPROTO_TCP:
848                spin_lock_bh(&ct->lock);
849                if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) {
850                        ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window;
851                }
852                if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) {
853                        ct->proto.tcp.seen[0].td_end = sis->src_td_end;
854                }
855                if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) {
856                        ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end;
857                }
858                if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) {
859                        ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window;
860                }
861                if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) {
862                        ct->proto.tcp.seen[1].td_end = sis->dest_td_end;
863                }
864                if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) {
865                        ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end;
866                }
867                spin_unlock_bh(&ct->lock);
868                break;
869        }
870
871        /*
872         * Release connection
873         */
874        nf_ct_put(ct);
875}
876
877/*
878 * sfe_cm_device_event()
879 */
880static int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr)
881{
882        struct net_device *dev = SFE_DEV_EVENT_PTR(ptr);
883
884        if (dev && (event == NETDEV_DOWN)) {
885                sfe_ipv4_destroy_all_rules_for_dev(dev);
886#ifdef SFE_SUPPORT_IPV6
887                sfe_ipv6_destroy_all_rules_for_dev(dev);
888#endif
889        }
890        return NOTIFY_DONE;
891}
892
893/*
894 * sfe_cm_inet_event()
895 */
896static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
897{
898        struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
899
900        if (dev && (event == NETDEV_DOWN)) {
901                sfe_ipv4_destroy_all_rules_for_dev(dev);
902        }
903
904        return NOTIFY_DONE;
905}
906
#ifdef SFE_SUPPORT_IPV6
/*
 * sfe_cm_inet6_event()
 *	IPv6 address notifier callback.
 *
 * ptr is treated as a struct inet6_ifaddr; the owning net_device is reached
 * through its idev.  On NETDEV_DOWN all IPv6 rules for that device are
 * destroyed.  Always returns NOTIFY_DONE.
 */
static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
	struct net_device *netdev = ifa6->idev->dev;

	if (!netdev || event != NETDEV_DOWN) {
		return NOTIFY_DONE;
	}

	sfe_ipv6_destroy_all_rules_for_dev(netdev);

	return NOTIFY_DONE;
}
#endif
922/*
923 * sfe_cm_get_exceptions
924 *      dump exception counters
925 */
926static ssize_t sfe_cm_get_exceptions(struct device *dev,
927                                     struct device_attribute *attr,
928                                     char *buf)
929{
930        int idx, len;
931        struct sfe_cm *sc = &__sc;
932
933        spin_lock_bh(&sc->lock);
934        for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) {
935                if (sc->exceptions[idx]) {
936                        len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]);
937                }
938        }
939        spin_unlock_bh(&sc->lock);
940
941        return len;
942}
943
944/*
945 * sfe_cm_get_stop
946 *      dump stop
947 */
948static ssize_t sfe_cm_get_stop(struct device *dev,
949                               struct device_attribute *attr,
950                               char *buf)
951{
952        int (*fast_recv)(struct sk_buff *skb);
953        rcu_read_lock();
954        fast_recv = rcu_dereference(fast_nat_recv);
955        rcu_read_unlock();
956        return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", fast_recv ? 0 : 1);
957}
958
959static ssize_t sfe_cm_set_stop(struct device *dev,
960                               struct device_attribute *attr,
961                               const char *buf, size_t count)
962{
963        int ret;
964        u32 num;
965        int (*fast_recv)(struct sk_buff *skb);
966
967        ret = kstrtou32(buf, 0, &num);
968        if (ret)
969                return ret;
970
971        /*
972         * Hook/Unhook the receive path in the network stack.
973         */
974        if (num) {
975                RCU_INIT_POINTER(fast_nat_recv, NULL);
976        } else {
977                rcu_read_lock();
978                fast_recv = rcu_dereference(fast_nat_recv);
979                rcu_read_unlock();
980                if (!fast_recv) {
981                        BUG_ON(fast_nat_recv);
982                        RCU_INIT_POINTER(fast_nat_recv, sfe_cm_recv);
983                }
984        }
985
986        DEBUG_TRACE("sfe_cm_stop = %d\n", num);
987        return count;
988}
989
990/*
991 * sfe_cm_get_defunct_all
992 *      dump state of SFE
993 */
994static ssize_t sfe_cm_get_defunct_all(struct device *dev,
995                                      struct device_attribute *attr,
996                                      char *buf)
997{
998        return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", 0);
999}
1000
1001static ssize_t sfe_cm_set_defunct_all(struct device *dev,
1002                                      struct device_attribute *attr,
1003                                      const char *buf, size_t count)
1004{
1005        sfe_ipv4_destroy_all_rules_for_dev(NULL);
1006#ifdef SFE_SUPPORT_IPV6
1007        sfe_ipv6_destroy_all_rules_for_dev(NULL);
1008#endif
1009        return count;
1010}
1011
1012/*
1013 * sysfs attributes.
1014 */
1015static const struct device_attribute sfe_attrs[] = {
1016        __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL),
1017        __ATTR(stop, S_IWUSR | S_IRUGO, sfe_cm_get_stop, sfe_cm_set_stop),
1018        __ATTR(defunct_all, S_IWUSR | S_IRUGO, sfe_cm_get_defunct_all, sfe_cm_set_defunct_all),
1019};
1020
1021/*
1022 * sfe_cm_init()
1023 */
1024static int __init sfe_cm_init(void)
1025{
1026        struct sfe_cm *sc = &__sc;
1027        int result = -1;
1028        size_t i, j;
1029
1030#ifdef SFE_SUPPORT_IPV6
1031        sfe_ipv6_init();
1032#endif
1033        sfe_ipv4_init();
1034
1035        DEBUG_INFO("SFE CM init\n");
1036
1037        /*
1038         * Create sys/sfe_cm
1039         */
1040        sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL);
1041        if (!sc->sys_sfe_cm) {
1042                DEBUG_ERROR("failed to register sfe_cm\n");
1043                goto exit1;
1044        }
1045
1046        for (i = 0; i < ARRAY_SIZE(sfe_attrs); i++) {
1047                result = sysfs_create_file(sc->sys_sfe_cm, &sfe_attrs[i].attr);
1048                if (result) {
1049                        DEBUG_ERROR("failed to register %s : %d\n",
1050                                    sfe_attrs[i].attr.name, result);
1051                        goto exit2;
1052                }
1053        }
1054
1055        sc->dev_notifier.notifier_call = sfe_cm_device_event;
1056        sc->dev_notifier.priority = 1;
1057        register_netdevice_notifier(&sc->dev_notifier);
1058
1059        sc->inet_notifier.notifier_call = sfe_cm_inet_event;
1060        sc->inet_notifier.priority = 1;
1061        register_inetaddr_notifier(&sc->inet_notifier);
1062#ifdef SFE_SUPPORT_IPV6
1063        if (register_inet6addr_notifier) {
1064                sc->inet6_notifier.notifier_call = sfe_cm_inet6_event;
1065                sc->inet6_notifier.priority = 1;
1066                register_inet6addr_notifier(&sc->inet6_notifier);
1067        }
1068#endif
1069        /*
1070         * Register our netfilter hooks.
1071         */
1072        result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1073        if (result < 0) {
1074                DEBUG_ERROR("can't register nf post routing hook: %d\n", result);
1075                goto exit3;
1076        }
1077
1078#ifdef CONFIG_NF_CONNTRACK_EVENTS
1079        /*
1080         * Register a notifier hook to get fast notifications of expired connections.
1081         */
1082        result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier);
1083        if (result < 0) {
1084                DEBUG_ERROR("can't register nf notifier hook: %d\n", result);
1085                goto exit4;
1086        }
1087#endif
1088
1089        spin_lock_init(&sc->lock);
1090
1091        /*
1092         * Hook the shortcut sync callback.
1093         */
1094        sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule);
1095#ifdef SFE_SUPPORT_IPV6
1096        sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule);
1097#endif
1098        fast_classifier_init();
1099
1100        return 0;
1101
1102#ifdef CONFIG_NF_CONNTRACK_EVENTS
1103exit4:
1104        nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1105#endif
1106exit3:
1107#ifdef SFE_SUPPORT_IPV6
1108        if (unregister_inet6addr_notifier) {
1109                unregister_inet6addr_notifier(&sc->inet6_notifier);
1110        }
1111#endif
1112        unregister_inetaddr_notifier(&sc->inet_notifier);
1113        unregister_netdevice_notifier(&sc->dev_notifier);
1114exit2:
1115        for (j = 0; j < i; j++) {
1116                sysfs_remove_file(sc->sys_sfe_cm, &sfe_attrs[j].attr);
1117        }
1118        kobject_put(sc->sys_sfe_cm);
1119
1120exit1:
1121        sfe_ipv4_exit();
1122#ifdef SFE_SUPPORT_IPV6
1123        sfe_ipv6_exit();
1124#endif
1125
1126        return result;
1127}
1128
1129/*
1130 * sfe_cm_exit()
1131 */
1132static void __exit sfe_cm_exit(void)
1133{
1134        struct sfe_cm *sc = &__sc;
1135
1136        DEBUG_INFO("SFE CM exit\n");
1137        fast_classifier_exit();
1138
1139        /*
1140         * Unregister our sync callback.
1141         */
1142        sfe_ipv4_register_sync_rule_callback(NULL);
1143#ifdef SFE_SUPPORT_IPV6
1144        sfe_ipv6_register_sync_rule_callback(NULL);
1145#endif
1146        /*
1147         * Unregister our receive callback.
1148         */
1149        RCU_INIT_POINTER(fast_nat_recv, NULL);
1150
1151        /*
1152         * Wait for all callbacks to complete.
1153         */
1154        rcu_barrier();
1155
1156        /*
1157         * Destroy all connections.
1158         */
1159        sfe_ipv4_destroy_all_rules_for_dev(NULL);
1160#ifdef SFE_SUPPORT_IPV6
1161        sfe_ipv6_destroy_all_rules_for_dev(NULL);
1162#endif
1163#ifdef CONFIG_NF_CONNTRACK_EVENTS
1164        nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier);
1165
1166#endif
1167        nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1168
1169#ifdef SFE_SUPPORT_IPV6
1170        if (unregister_inet6addr_notifier) {
1171                unregister_inet6addr_notifier(&sc->inet6_notifier);
1172        }
1173#endif
1174        unregister_inetaddr_notifier(&sc->inet_notifier);
1175        unregister_netdevice_notifier(&sc->dev_notifier);
1176
1177        kobject_put(sc->sys_sfe_cm);
1178        sfe_ipv4_exit();
1179#ifdef SFE_SUPPORT_IPV6
1180        sfe_ipv6_exit();
1181#endif
1182}
1183
1184module_init(sfe_cm_init)
1185module_exit(sfe_cm_exit)
1186
1187MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager");
1188MODULE_LICENSE("Dual BSD/GPL");
1189
Note: See TracBrowser for help on using the repository browser.