source: src/linux/universal/linux-3.2/net/shortcut-fe/sfe_cm.c @ 33046

Last change on this file since 33046 was 33046, checked in by brainslayer, 10 days ago

dissent: sfe: support qos ingress shaping

File size: 30.6 KB
Line 
1/*
2 * sfe-cm.c
3 *      Shortcut forwarding engine connection manager.
4 *
5 * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved.
6 * Permission to use, copy, modify, and/or distribute this software for
7 * any purpose with or without fee is hereby granted, provided that the
8 * above copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
15 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <linux/module.h>
19#include <linux/sysfs.h>
20#include <linux/skbuff.h>
21#include <net/route.h>
22#include <net/ip6_route.h>
23#include <net/addrconf.h>
24#include <net/dsfield.h>
25#include <linux/inetdevice.h>
26#include <linux/netfilter_bridge.h>
27#include <linux/netfilter_ipv6.h>
28#include <net/netfilter/nf_conntrack_acct.h>
29#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_zones.h>
31#include <net/netfilter/nf_conntrack_core.h>
32#include <linux/netfilter/xt_dscp.h>
33#include <linux/if_bridge.h>
34#include <net/pkt_sched.h>
35
36#include "sfe.h"
37#include "sfe_cm.h"
38#include "sfe_backport.h"
39
40#include "sfe_ipv4.c"
41#ifdef SFE_SUPPORT_IPV6
42#include "sfe_ipv6.c"
43#endif
44#include "fast-classifier.c"
45
/*
 * Reasons for which sfe_cm_post_routing() declines to offer a flow to the
 * forwarding engine.  Each value indexes one counter in the module's
 * exceptions[] array and one name in sfe_cm_exception_events_string[], so
 * the order here must stay in step with that table.
 */
typedef enum sfe_cm_exception {
        SFE_CM_EXCEPTION_PACKET_BROADCAST = 0,  /* skb->pkt_type is PACKET_BROADCAST */
        SFE_CM_EXCEPTION_PACKET_MULTICAST,      /* skb->pkt_type is PACKET_MULTICAST */
        SFE_CM_EXCEPTION_NO_IIF,                /* no device found for skb->skb_iif */
        SFE_CM_EXCEPTION_NO_CT,                 /* no conntrack entry on the packet */
        SFE_CM_EXCEPTION_CT_NO_TRACK,           /* conntrack entry is untracked */
        SFE_CM_EXCEPTION_CT_NO_CONFIRM,         /* conntrack entry not yet confirmed */
        SFE_CM_EXCEPTION_CT_IS_ALG,             /* connection has a conntrack helper */
        SFE_CM_EXCEPTION_IS_IPV4_MCAST,         /* IPv4 source or destination is multicast */
        SFE_CM_EXCEPTION_IS_IPV6_MCAST,         /* IPv6 source or destination is multicast */
        SFE_CM_EXCEPTION_TCP_NOT_ASSURED,       /* TCP connection without IPS_ASSURED */
        SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED,   /* TCP state other than ESTABLISHED */
        SFE_CM_EXCEPTION_UNKNOW_PROTOCOL,       /* protocol is neither TCP nor UDP */
        SFE_CM_EXCEPTION_NO_SRC_DEV,            /* no device/MAC for the source address */
        SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV,      /* no device/MAC for the translated source */
        SFE_CM_EXCEPTION_NO_DEST_DEV,           /* no device/MAC for the destination address */
        SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV,     /* no device/MAC for the translated destination */
        SFE_CM_EXCEPTION_NO_BRIDGE,             /* bridge port without a master device */
        SFE_CM_EXCEPTION_LOCAL_OUT,             /* packet has a socket attached (skb->sk) */
        SFE_CM_EXCEPTION_MAX                    /* number of reasons; not a reason itself */
} sfe_cm_exception_t;
67
68static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = {
69        "PACKET_BROADCAST",
70        "PACKET_MULTICAST",
71        "NO_IIF",
72        "NO_CT",
73        "CT_NO_TRACK",
74        "CT_NO_CONFIRM",
75        "CT_IS_ALG",
76        "IS_IPV4_MCAST",
77        "IS_IPV6_MCAST",
78        "TCP_NOT_ASSURED",
79        "TCP_NOT_ESTABLISHED",
80        "UNKNOW_PROTOCOL",
81        "NO_SRC_DEV",
82        "NO_SRC_XLATE_DEV",
83        "NO_DEST_DEV",
84        "NO_DEST_XLATE_DEV",
85        "NO_BRIDGE",
86        "LOCAL_OUT"
87};
88
/*
 * Per-module structure.
 *
 * Groups the connection manager's lock, its sysfs handle, the notifier
 * blocks it owns and the per-reason exception counters.
 */
struct sfe_cm {
        spinlock_t lock;                /* Lock for SMP correctness; held while exceptions[] is updated */

        /*
         * Control state.
         */
        struct kobject *sys_sfe_cm;     /* sysfs linkage */

        /*
         * Callback notifiers.
         */
        struct notifier_block dev_notifier;     /* Device notifier */
        struct notifier_block inet_notifier;    /* IPv4 notifier */
#ifdef SFE_SUPPORT_IPV6
        struct notifier_block inet6_notifier;   /* IPv6 notifier */
#endif
        u32 exceptions[SFE_CM_EXCEPTION_MAX];   /* One counter per sfe_cm_exception_t reason */
};

/*
 * The single instance of the per-module state.
 */
static struct sfe_cm __sc;
112
113
114/*
115 * sfe_cm_incr_exceptions()
116 *      increase an exception counter.
117 */
118static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except)
119{
120        struct sfe_cm *sc = &__sc;
121
122        spin_lock_bh(&sc->lock);
123        sc->exceptions[except]++;
124        spin_unlock_bh(&sc->lock);
125}
126
127/*
128 * sfe_cm_recv()
129 *      Handle packet receives.
130 *
131 * Returns 1 if the packet is forwarded or 0 if it isn't.
132 */
133static int sfe_cm_recv(struct sk_buff *skb)
134{
135        struct net_device *dev;
136
137        /*
138         * We know that for the vast majority of packets we need the transport
139         * layer header so we may as well start to fetch it now!
140         */
141        prefetch(skb->data + 32);
142        barrier();
143
144        dev = skb->dev;
145
146#ifdef CONFIG_NET_CLS_ACT
147        /*
148         * If ingress Qdisc configured, and packet not processed by ingress Qdisc yet
149         * We cannot accelerate this packet.
150         */
151        if (dev->ingress_queue && !(skb->tc_verd & TC_NCLS)) {
152                return 0;
153        }
154#endif
155
156        /*
157         * We're only interested in IPv4 and IPv6 packets.
158         */
159        if (likely(htons(ETH_P_IP) == skb->protocol)) {
160                struct in_device *in_dev;
161
162                /*
163                 * Does our input device support IP processing?
164                 */
165                in_dev = (struct in_device *)dev->ip_ptr;
166                if (unlikely(!in_dev)) {
167                        DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
168                        return 0;
169                }
170
171                /*
172                 * Does it have an IP address?  If it doesn't then we can't do anything
173                 * interesting here!
174                 */
175                if (unlikely(!in_dev->ifa_list)) {
176                        DEBUG_TRACE("no IP address for device: %s\n", dev->name);
177                        return 0;
178                }
179
180                return sfe_ipv4_recv(dev, skb);
181        }
182
183#ifdef SFE_SUPPORT_IPV6
184        if (likely(htons(ETH_P_IPV6) == skb->protocol)) {
185                struct inet6_dev *in_dev;
186
187                /*
188                 * Does our input device support IPv6 processing?
189                 */
190                in_dev = (struct inet6_dev *)dev->ip6_ptr;
191                if (unlikely(!in_dev)) {
192                        DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name);
193                        return 0;
194                }
195
196                /*
197                 * Does it have an IPv6 address?  If it doesn't then we can't do anything
198                 * interesting here!
199                 */
200                if (unlikely(list_empty(&in_dev->addr_list))) {
201                        DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name);
202                        return 0;
203                }
204
205                return sfe_ipv6_recv(dev, skb);
206        }
207#endif
208        DEBUG_TRACE("not IP packet\n");
209        return 0;
210}
211
212/*
213 * sfe_cm_find_dev_and_mac_addr()
214 *      Find the device and MAC address for a given IPv4/IPv6 address.
215 *
216 * Returns true if we find the device and MAC address, otherwise false.
217 *
218 * We look up the rtable entry for the address and, from its neighbour
219 * structure, obtain the hardware address.  This means this function also
220 * works if the neighbours are routers too.
221 */
222static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4)
223{
224        struct neighbour *neigh;
225        struct rtable *rt;
226        struct rt6_info *rt6 = NULL;
227        struct dst_entry *dst;
228        struct net_device *mac_dev;
229
230        /*
231         * Look up the rtable entry for the IP address then get the hardware
232         * address from its neighbour structure.  This means this work when the
233         * neighbours are routers too.
234         */
235        if (likely(is_v4)) {
236                rt = ip_route_output(&init_net, addr->ip, 0, 0, 0);
237                if (unlikely(IS_ERR(rt))) {
238                        goto ret_fail;
239                }
240
241                dst = (struct dst_entry *)rt;
242        } else {
243                if (rt6_lookup)
244                    rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0);
245                if (!rt6) {
246                        goto ret_fail;
247                }
248
249                dst = (struct dst_entry *)rt6;
250        }
251
252        rcu_read_lock();
253        neigh = dst_neigh_lookup(dst, addr);
254        if (unlikely(!neigh)) {
255                rcu_read_unlock();
256                dst_release(dst);
257                goto ret_fail;
258        }
259
260        if (unlikely(!(neigh->nud_state & NUD_VALID))) {
261                rcu_read_unlock();
262                neigh_release(neigh);
263                dst_release(dst);
264                goto ret_fail;
265        }
266
267        mac_dev = neigh->dev;
268        if (!mac_dev) {
269                rcu_read_unlock();
270                neigh_release(neigh);
271                dst_release(dst);
272                goto ret_fail;
273        }
274
275        memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len);
276
277        dev_hold(mac_dev);
278        *dev = mac_dev;
279        rcu_read_unlock();
280        neigh_release(neigh);
281        dst_release(dst);
282
283        return true;
284
285ret_fail:
286        if (is_v4) {
287                DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip);
288
289        } else {
290                DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6);
291        }
292
293        return false;
294}
295
296EXPORT_SYMBOL(sfe_cm_find_dev_and_mac_addr);
297/*
298 * sfe_cm_post_routing()
299 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
300 */
301static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4)
302{
303        struct sfe_connection_create sic;
304        struct net_device *in;
305        struct nf_conn *ct;
306        enum ip_conntrack_info ctinfo;
307        struct net_device *dev;
308        struct net_device *src_dev;
309        struct net_device *dest_dev;
310        struct net_device *src_br_dev = NULL;
311        struct net_device *dest_br_dev = NULL;
312        struct nf_conntrack_tuple orig_tuple;
313        struct nf_conntrack_tuple reply_tuple;
314        SFE_NF_CONN_ACCT(acct);
315
316        /*
317         * Don't process broadcast or multicast packets.
318         */
319        if (unlikely(skb->pkt_type == PACKET_BROADCAST)) {
320                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST);
321                DEBUG_TRACE("broadcast, ignoring\n");
322                return NF_ACCEPT;
323        }
324        if (unlikely(skb->pkt_type == PACKET_MULTICAST)) {
325                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST);
326                DEBUG_TRACE("multicast, ignoring\n");
327                return NF_ACCEPT;
328        }
329
330#ifdef CONFIG_XFRM
331        /*
332         * Packet to xfrm for encapsulation, we can't process it
333         */
334        if (unlikely(skb_dst(skb)->xfrm)) {
335                DEBUG_TRACE("packet to xfrm, ignoring\n");
336                return NF_ACCEPT;
337        }
338#endif
339
340        /*
341         * Don't process locally generated packets.
342         */
343        if (skb->sk) {
344                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT);
345                DEBUG_TRACE("skip local out packet\n");
346                return NF_ACCEPT;
347        }
348
349        /*
350         * Don't process packets that are not being forwarded.
351         */
352        in = dev_get_by_index(&init_net, skb->skb_iif);
353        if (!in) {
354                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF);
355                DEBUG_TRACE("packet not forwarding\n");
356                return NF_ACCEPT;
357        }
358
359        dev_put(in);
360
361        /*
362         * Don't process packets that aren't being tracked by conntrack.
363         */
364        ct = nf_ct_get(skb, &ctinfo);
365        if (unlikely(!ct)) {
366                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT);
367                DEBUG_TRACE("no conntrack connection, ignoring\n");
368                return NF_ACCEPT;
369        }
370
371        /*
372         * Don't process untracked connections.
373         */
374        if (unlikely(nf_ct_is_untracked(ct))) {
375                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK);
376                DEBUG_TRACE("untracked connection\n");
377                return NF_ACCEPT;
378        }
379
380        /*
381         * Unconfirmed connection may be dropped by Linux at the final step,
382         * So we don't process unconfirmed connections.
383         */
384        if (!nf_ct_is_confirmed(ct)) {
385                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM);
386                DEBUG_TRACE("unconfirmed connection\n");
387                return NF_ACCEPT;
388        }
389
390        /*
391         * Don't process connections that require support from a 'helper' (typically a NAT ALG).
392         */
393        if (unlikely(nfct_help(ct))) {
394                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG);
395                DEBUG_TRACE("connection has helper\n");
396                return NF_ACCEPT;
397        }
398
399        /*
400         * Check if the acceleration of a flow could be rejected quickly.
401         */
402        acct = nf_conn_acct_find(ct);
403        if (acct) {
404                long long packets = atomic64_read((atomic64_t *)&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets);
405                if ((packets > 0xff) && (packets & 0xff)) {
406                        /*
407                         * Connection hits slow path at least 256 times, so it must be not able to accelerate.
408                         * But we also give it a chance to walk through ECM every 256 packets
409                         */
410                        return NF_ACCEPT;
411                }
412        }
413
414        /*
415         * Look up the details of our connection in conntrack.
416         *
417         * Note that the data we get from conntrack is for the "ORIGINAL" direction
418         * but our packet may actually be in the "REPLY" direction.
419         */
420        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
421        reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
422        sic.protocol = (s32)orig_tuple.dst.protonum;
423
424        sic.flags = 0;
425
426        /*
427         * Get addressing information, non-NAT first
428         */
429        if (likely(is_v4)) {
430                u32 dscp;
431
432                sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
433                sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
434
435                if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) {
436                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST);
437                        DEBUG_TRACE("multicast address\n");
438                        return NF_ACCEPT;
439                }
440
441                /*
442                 * NAT'ed addresses - note these are as seen from the 'reply' direction
443                 * When NAT does not apply to this connection these will be identical to the above.
444                 */
445                sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip;
446                sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip;
447
448                dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
449                if (dscp) {
450                        sic.dest_dscp = dscp;
451                        sic.src_dscp = sic.dest_dscp;
452                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
453                }
454        }
455#ifdef SFE_SUPPORT_IPV6
456        else {
457                u32 dscp;
458
459                sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
460                sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
461
462                if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) ||
463                    ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) {
464                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST);
465                        DEBUG_TRACE("multicast address\n");
466                        return NF_ACCEPT;
467                }
468
469                /*
470                 * NAT'ed addresses - note these are as seen from the 'reply' direction
471                 * When NAT does not apply to this connection these will be identical to the above.
472                 */
473                sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6);
474                sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6);
475
476                dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
477                if (dscp) {
478                        sic.dest_dscp = dscp;
479                        sic.src_dscp = sic.dest_dscp;
480                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
481                }
482        }
483#endif
484        switch (sic.protocol) {
485        case IPPROTO_TCP:
486                sic.src_port = orig_tuple.src.u.tcp.port;
487                sic.dest_port = orig_tuple.dst.u.tcp.port;
488                sic.src_port_xlate = reply_tuple.dst.u.tcp.port;
489                sic.dest_port_xlate = reply_tuple.src.u.tcp.port;
490                sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
491                sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
492                sic.src_td_end = ct->proto.tcp.seen[0].td_end;
493                sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
494                sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
495                sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
496                sic.dest_td_end = ct->proto.tcp.seen[1].td_end;
497                sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;
498
499                if (nf_ct_tcp_no_window_check
500                    || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
501                    || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
502                        sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK;
503                }
504
505                /*
506                 * Don't try to manage a non-established connection.
507                 */
508                if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
509                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED);
510                        DEBUG_TRACE("non-established connection\n");
511                        return NF_ACCEPT;
512                }
513
514                /*
515                 * If the connection is shutting down do not manage it.
516                 * state can not be SYN_SENT, SYN_RECV because connection is assured
517                 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
518                 */
519                spin_lock_bh(&ct->lock);
520                if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
521                        spin_unlock_bh(&ct->lock);
522                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED);
523                        DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
524                                    ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port),
525                                    &sic.dest_ip, ntohs(sic.dest_port));
526                        return NF_ACCEPT;
527                }
528                spin_unlock_bh(&ct->lock);
529                break;
530
531        case IPPROTO_UDP:
532                sic.src_port = orig_tuple.src.u.udp.port;
533                sic.dest_port = orig_tuple.dst.u.udp.port;
534                sic.src_port_xlate = reply_tuple.dst.u.udp.port;
535                sic.dest_port_xlate = reply_tuple.src.u.udp.port;
536                break;
537
538        default:
539                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL);
540                DEBUG_TRACE("unhandled protocol %d\n", sic.protocol);
541                return NF_ACCEPT;
542        }
543
544#ifdef CONFIG_XFRM
545        sic.original_accel = 1;
546        sic.reply_accel = 1;
547
548        /*
549         * For packets de-capsulated from xfrm, we still can accelerate it
550         * on the direction we just received the packet.
551         */
552        if (unlikely(skb->sp)) {
553                if (sic.protocol == IPPROTO_TCP &&
554                    !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) {
555                        return NF_ACCEPT;
556                }
557
558                if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
559                        sic.reply_accel = 0;
560                } else {
561                        sic.original_accel = 0;
562                }
563        }
564#endif
565
566        /*
567         * Get QoS information
568         */
569        if (skb->priority) {
570                sic.dest_priority = skb->priority;
571                sic.src_priority = sic.dest_priority;
572                sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY;
573        }
574
575        /*
576         * Get the net device and MAC addresses that correspond to the various source and
577         * destination host addresses.
578         */
579        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev, sic.src_mac, is_v4)) {
580                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV);
581                return NF_ACCEPT;
582        }
583
584        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) {
585                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV);
586                goto done1;
587        }
588
589        dev_put(dev);
590
591        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) {
592                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV);
593                goto done1;
594        }
595
596        dev_put(dev);
597
598        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev, sic.dest_mac_xlate, is_v4)) {
599                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV);
600                goto done1;
601        }
602
603        /*
604         * Our devices may actually be part of a bridge interface.  If that's
605         * the case then find the bridge interface instead.
606         */
607        if (src_dev->priv_flags & IFF_BRIDGE_PORT) {
608                src_br_dev = sfe_dev_get_master(src_dev);
609                if (!src_br_dev) {
610                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
611                        DEBUG_TRACE("no bridge found for: %s\n", src_dev->name);
612                        goto done2;
613                }
614
615                src_dev = src_br_dev;
616        }
617
618        if (dest_dev->priv_flags & IFF_BRIDGE_PORT) {
619                dest_br_dev = sfe_dev_get_master(dest_dev);
620                if (!dest_br_dev) {
621                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
622                        DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name);
623                        goto done3;
624                }
625
626                dest_dev = dest_br_dev;
627        }
628
629        sic.src_dev = src_dev;
630        sic.dest_dev = dest_dev;
631
632        sic.src_mtu = src_dev->mtu;
633        sic.dest_mtu = dest_dev->mtu;
634        sic.mark = skb->mark;
635        if (likely(is_v4)) {
636                sfe_ipv4_create_rule(&sic);
637        }
638#ifdef SFE_SUPPORT_IPV6
639        else {
640                sfe_ipv6_create_rule(&sic);
641        }
642#endif
643        /*
644         * If we had bridge ports then release them too.
645         */
646        if (dest_br_dev) {
647                dev_put(dest_br_dev);
648        }
649
650done3:
651        if (src_br_dev) {
652                dev_put(src_br_dev);
653        }
654
655done2:
656        dev_put(dest_dev);
657
658done1:
659        dev_put(src_dev);
660
661        return NF_ACCEPT;
662}
663
/*
 * sfe_cm_ipv4_post_routing_hook()
 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
 *
 * IPv4 entry point: forwards the packet to sfe_cm_post_routing() with the
 * IPv4 flag set and returns its verdict.
 *
 * NOTE(review): the line below carries no return type or parameter types,
 * so it is presumably a macro that expands to the real hook prototype
 * (likely from sfe_backport.h - confirm).  The hook table in this file
 * refers to the function as __sfe_cm_ipv4_post_routing_hook.
 */
sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
{
        return sfe_cm_post_routing(skb, true);
}
672
/*
 * sfe_cm_ipv6_post_routing_hook()
 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
 *
 * IPv6 entry point, only built when SFE_SUPPORT_IPV6 is defined: forwards
 * the packet to sfe_cm_post_routing() with the IPv4 flag clear and returns
 * its verdict.
 *
 * NOTE(review): as with the IPv4 hook, the signature line is presumably a
 * macro expanding to the real hook prototype - confirm against
 * sfe_backport.h.
 */
#ifdef SFE_SUPPORT_IPV6
sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
{
        return sfe_cm_post_routing(skb, false);
}
#endif
683#ifdef CONFIG_NF_CONNTRACK_EVENTS
684/*
685 * sfe_cm_conntrack_event()
686 *      Callback event invoked when a conntrack connection's state changes.
687 */
688static int sfe_cm_conntrack_event(struct notifier_block *this,
689                                  unsigned long events, void *ptr)
690{
691        struct nf_ct_event *item = ptr;
692        struct sfe_connection_destroy sid;
693        struct nf_conn *ct = item->ct;
694        struct nf_conntrack_tuple orig_tuple;
695
696        /*
697         * If we don't have a conntrack entry then we're done.
698         */
699        if (unlikely(!ct)) {
700                DEBUG_WARN("no ct in conntrack event callback\n");
701                return NOTIFY_DONE;
702        }
703
704        /*
705         * If this is an untracked connection then we can't have any state either.
706         */
707        if (unlikely(nf_ct_is_untracked(ct))) {
708                DEBUG_TRACE("ignoring untracked conn\n");
709                return NOTIFY_DONE;
710        }
711
712        /*
713         * We're only interested in destroy events.
714         */
715        if (unlikely(!(events & (1 << IPCT_DESTROY)))) {
716                DEBUG_TRACE("ignoring non-destroy event\n");
717                return NOTIFY_DONE;
718        }
719
720        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
721        sid.protocol = (s32)orig_tuple.dst.protonum;
722
723        /*
724         * Extract information from the conntrack connection.  We're only interested
725         * in nominal connection information (i.e. we're ignoring any NAT information).
726         */
727        switch (sid.protocol) {
728        case IPPROTO_TCP:
729                sid.src_port = orig_tuple.src.u.tcp.port;
730                sid.dest_port = orig_tuple.dst.u.tcp.port;
731                break;
732
733        case IPPROTO_UDP:
734                sid.src_port = orig_tuple.src.u.udp.port;
735                sid.dest_port = orig_tuple.dst.u.udp.port;
736                break;
737
738        default:
739                DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol);
740                return NOTIFY_DONE;
741        }
742
743        if (likely(nf_ct_l3num(ct) == AF_INET)) {
744                sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
745                sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;
746
747                sfe_ipv4_destroy_rule(&sid);
748        }
749#ifdef SFE_SUPPORT_IPV6
750        else if (likely(nf_ct_l3num(ct) == AF_INET6)) {
751                sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
752                sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);
753
754                sfe_ipv6_destroy_rule(&sid);
755        }
756#endif
757        else {
758                DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n");
759        }
760
761        return NOTIFY_DONE;
762}
763
/*
 * Netfilter conntrack event system to monitor connection tracking changes
 *
 * Notifier block whose callback is sfe_cm_conntrack_event(); only built
 * when CONFIG_NF_CONNTRACK_EVENTS is defined.
 */
static struct notifier_block sfe_cm_conntrack_notifier = {
        .notifier_call = sfe_cm_conntrack_event,
};
770#endif
771
/*
 * Structure to establish a hook into the post routing netfilter point - this
 * will pick up local outbound and packets going from one interface to another.
 *
 * Note: see include/linux/netfilter_ipv4.h for info related to priority levels.
 * We want to examine packets after NAT translation and any ALG processing.
 *
 * One entry for IPv4 and, when SFE_SUPPORT_IPV6 is defined, one for IPv6.
 * NOTE(review): the SFE_IPV4/IPV6_NF_POST_ROUTING_HOOK macros that build
 * each nf_hook_ops entry are defined outside this file (presumably in
 * sfe_backport.h) - confirm the hook number and priority there.
 */
static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = {
        SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook),
#ifdef SFE_SUPPORT_IPV6
        SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook),
#endif
};
785
786/*
787 * sfe_cm_sync_rule()
788 *      Synchronize a connection's state.
789 */
790static void sfe_cm_sync_rule(struct sfe_connection_sync *sis)
791{
792        struct nf_conntrack_tuple_hash *h;
793        struct nf_conntrack_tuple tuple;
794        struct nf_conn *ct;
795        SFE_NF_CONN_ACCT(acct);
796
797        /*
798         * Create a tuple so as to be able to look up a connection
799         */
800        memset(&tuple, 0, sizeof(tuple));
801        tuple.src.u.all = (__be16)sis->src_port;
802        tuple.dst.dir = IP_CT_DIR_ORIGINAL;
803        tuple.dst.protonum = (u8)sis->protocol;
804        tuple.dst.u.all = (__be16)sis->dest_port;
805
806        if (sis->is_v6) {
807                tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6);
808                tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6);
809                tuple.src.l3num = AF_INET6;
810
811                DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n",
812                            (int)tuple.dst.protonum,
813                            &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all),
814                            &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all));
815        } else {
816                tuple.src.u3.ip = sis->src_ip.ip;
817                tuple.dst.u3.ip = sis->dest_ip.ip;
818                tuple.src.l3num = AF_INET;
819
820                DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n",
821                            (int)tuple.dst.protonum,
822                            &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all),
823                            &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all));
824        }
825
826        /*
827         * Look up conntrack connection
828         */
829        h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple);
830        if (unlikely(!h)) {
831                DEBUG_TRACE("no connection found\n");
832                return;
833        }
834
835        ct = nf_ct_tuplehash_to_ctrack(h);
836        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
837
838        /*
839         * Only update if this is not a fixed timeout
840         */
841        if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
842                spin_lock_bh(&ct->lock);
843                ct->timeout.expires += sis->delta_jiffies;
844                spin_unlock_bh(&ct->lock);
845        }
846
847        acct = nf_conn_acct_find(ct);
848        if (acct) {
849                spin_lock_bh(&ct->lock);
850                atomic64_add(sis->src_new_packet_count, (atomic64_t *)&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets);
851                atomic64_add(sis->src_new_byte_count, (atomic64_t *)&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes);
852                atomic64_add(sis->dest_new_packet_count, (atomic64_t *)&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets);
853                atomic64_add(sis->dest_new_byte_count, (atomic64_t *)&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes);
854                spin_unlock_bh(&ct->lock);
855        }
856
857        switch (sis->protocol) {
858        case IPPROTO_TCP:
859                spin_lock_bh(&ct->lock);
860                if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) {
861                        ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window;
862                }
863                if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) {
864                        ct->proto.tcp.seen[0].td_end = sis->src_td_end;
865                }
866                if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) {
867                        ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end;
868                }
869                if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) {
870                        ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window;
871                }
872                if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) {
873                        ct->proto.tcp.seen[1].td_end = sis->dest_td_end;
874                }
875                if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) {
876                        ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end;
877                }
878                spin_unlock_bh(&ct->lock);
879                break;
880        }
881
882        /*
883         * Release connection
884         */
885        nf_ct_put(ct);
886}
887
888/*
889 * sfe_cm_device_event()
890 */
891static int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr)
892{
893        struct net_device *dev = SFE_DEV_EVENT_PTR(ptr);
894
895        if (dev && (event == NETDEV_DOWN)) {
896                sfe_ipv4_destroy_all_rules_for_dev(dev);
897#ifdef SFE_SUPPORT_IPV6
898                sfe_ipv6_destroy_all_rules_for_dev(dev);
899#endif
900        }
901        return NOTIFY_DONE;
902}
903
904/*
905 * sfe_cm_inet_event()
906 */
907static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
908{
909        struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
910
911        if (dev && (event == NETDEV_DOWN)) {
912                sfe_ipv4_destroy_all_rules_for_dev(dev);
913        }
914
915        return NOTIFY_DONE;
916}
917
918#ifdef SFE_SUPPORT_IPV6
919/*
920 * sfe_cm_inet6_event()
921 */
922static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr)
923{
924        struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev;
925
926        if (dev && (event == NETDEV_DOWN)) {
927                sfe_ipv6_destroy_all_rules_for_dev(dev);
928        }
929
930        return NOTIFY_DONE;
931}
932#endif
933/*
934 * sfe_cm_get_exceptions
935 *      dump exception counters
936 */
937static ssize_t sfe_cm_get_exceptions(struct device *dev,
938                                     struct device_attribute *attr,
939                                     char *buf)
940{
941        int idx, len;
942        struct sfe_cm *sc = &__sc;
943
944        spin_lock_bh(&sc->lock);
945        for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) {
946                if (sc->exceptions[idx]) {
947                        len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]);
948                }
949        }
950        spin_unlock_bh(&sc->lock);
951
952        return len;
953}
954
955/*
956 * sfe_cm_get_stop
957 *      dump stop
958 */
959static ssize_t sfe_cm_get_stop(struct device *dev,
960                               struct device_attribute *attr,
961                               char *buf)
962{
963        int (*fast_recv)(struct sk_buff *skb);
964        rcu_read_lock();
965        fast_recv = rcu_dereference(fast_nat_recv);
966        rcu_read_unlock();
967        return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", fast_recv ? 0 : 1);
968}
969
970static ssize_t sfe_cm_set_stop(struct device *dev,
971                               struct device_attribute *attr,
972                               const char *buf, size_t count)
973{
974        int ret;
975        u32 num;
976        int (*fast_recv)(struct sk_buff *skb);
977
978        ret = kstrtou32(buf, 0, &num);
979        if (ret)
980                return ret;
981
982        /*
983         * Hook/Unhook the receive path in the network stack.
984         */
985        if (num) {
986                RCU_INIT_POINTER(fast_nat_recv, NULL);
987        } else {
988                rcu_read_lock();
989                fast_recv = rcu_dereference(fast_nat_recv);
990                rcu_read_unlock();
991                if (!fast_recv) {
992                        BUG_ON(fast_nat_recv);
993                        RCU_INIT_POINTER(fast_nat_recv, sfe_cm_recv);
994                }
995        }
996
997        DEBUG_TRACE("sfe_cm_stop = %d\n", num);
998        return count;
999}
1000
1001/*
1002 * sfe_cm_get_defunct_all
1003 *      dump state of SFE
1004 */
1005static ssize_t sfe_cm_get_defunct_all(struct device *dev,
1006                                      struct device_attribute *attr,
1007                                      char *buf)
1008{
1009        return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", 0);
1010}
1011
1012static ssize_t sfe_cm_set_defunct_all(struct device *dev,
1013                                      struct device_attribute *attr,
1014                                      const char *buf, size_t count)
1015{
1016        sfe_ipv4_destroy_all_rules_for_dev(NULL);
1017#ifdef SFE_SUPPORT_IPV6
1018        sfe_ipv6_destroy_all_rules_for_dev(NULL);
1019#endif
1020        return count;
1021}
1022
1023/*
1024 * sysfs attributes.
1025 */
1026static const struct device_attribute sfe_attrs[] = {
1027        __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL),
1028        __ATTR(stop, S_IWUSR | S_IRUGO, sfe_cm_get_stop, sfe_cm_set_stop),
1029        __ATTR(defunct_all, S_IWUSR | S_IRUGO, sfe_cm_get_defunct_all, sfe_cm_set_defunct_all),
1030};
1031
1032/*
1033 * sfe_cm_init()
1034 */
1035static int __init sfe_cm_init(void)
1036{
1037        struct sfe_cm *sc = &__sc;
1038        int result = -1;
1039        size_t i, j;
1040
1041#ifdef SFE_SUPPORT_IPV6
1042        sfe_ipv6_init();
1043#endif
1044        sfe_ipv4_init();
1045
1046        DEBUG_INFO("SFE CM init\n");
1047
1048        /*
1049         * Create sys/sfe_cm
1050         */
1051        sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL);
1052        if (!sc->sys_sfe_cm) {
1053                DEBUG_ERROR("failed to register sfe_cm\n");
1054                goto exit1;
1055        }
1056
1057        for (i = 0; i < ARRAY_SIZE(sfe_attrs); i++) {
1058                result = sysfs_create_file(sc->sys_sfe_cm, &sfe_attrs[i].attr);
1059                if (result) {
1060                        DEBUG_ERROR("failed to register %s : %d\n",
1061                                    sfe_attrs[i].attr.name, result);
1062                        goto exit2;
1063                }
1064        }
1065
1066        sc->dev_notifier.notifier_call = sfe_cm_device_event;
1067        sc->dev_notifier.priority = 1;
1068        register_netdevice_notifier(&sc->dev_notifier);
1069
1070        sc->inet_notifier.notifier_call = sfe_cm_inet_event;
1071        sc->inet_notifier.priority = 1;
1072        register_inetaddr_notifier(&sc->inet_notifier);
1073#ifdef SFE_SUPPORT_IPV6
1074        if (register_inet6addr_notifier) {
1075                sc->inet6_notifier.notifier_call = sfe_cm_inet6_event;
1076                sc->inet6_notifier.priority = 1;
1077                register_inet6addr_notifier(&sc->inet6_notifier);
1078        }
1079#endif
1080        /*
1081         * Register our netfilter hooks.
1082         */
1083        result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1084        if (result < 0) {
1085                DEBUG_ERROR("can't register nf post routing hook: %d\n", result);
1086                goto exit3;
1087        }
1088
1089#ifdef CONFIG_NF_CONNTRACK_EVENTS
1090        /*
1091         * Register a notifier hook to get fast notifications of expired connections.
1092         */
1093        result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier);
1094        if (result < 0) {
1095                DEBUG_ERROR("can't register nf notifier hook: %d\n", result);
1096                goto exit4;
1097        }
1098#endif
1099
1100        spin_lock_init(&sc->lock);
1101
1102        /*
1103         * Hook the shortcut sync callback.
1104         */
1105        sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule);
1106#ifdef SFE_SUPPORT_IPV6
1107        sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule);
1108#endif
1109        fast_classifier_init();
1110
1111        return 0;
1112
1113#ifdef CONFIG_NF_CONNTRACK_EVENTS
1114exit4:
1115        nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1116#endif
1117exit3:
1118#ifdef SFE_SUPPORT_IPV6
1119        if (unregister_inet6addr_notifier) {
1120                unregister_inet6addr_notifier(&sc->inet6_notifier);
1121        }
1122#endif
1123        unregister_inetaddr_notifier(&sc->inet_notifier);
1124        unregister_netdevice_notifier(&sc->dev_notifier);
1125exit2:
1126        for (j = 0; j < i; j++) {
1127                sysfs_remove_file(sc->sys_sfe_cm, &sfe_attrs[j].attr);
1128        }
1129        kobject_put(sc->sys_sfe_cm);
1130
1131exit1:
1132        sfe_ipv4_exit();
1133#ifdef SFE_SUPPORT_IPV6
1134        sfe_ipv6_exit();
1135#endif
1136
1137        return result;
1138}
1139
1140/*
1141 * sfe_cm_exit()
1142 */
1143static void __exit sfe_cm_exit(void)
1144{
1145        struct sfe_cm *sc = &__sc;
1146
1147        DEBUG_INFO("SFE CM exit\n");
1148        fast_classifier_exit();
1149
1150        /*
1151         * Unregister our sync callback.
1152         */
1153        sfe_ipv4_register_sync_rule_callback(NULL);
1154#ifdef SFE_SUPPORT_IPV6
1155        sfe_ipv6_register_sync_rule_callback(NULL);
1156#endif
1157        /*
1158         * Unregister our receive callback.
1159         */
1160        RCU_INIT_POINTER(fast_nat_recv, NULL);
1161
1162        /*
1163         * Wait for all callbacks to complete.
1164         */
1165        rcu_barrier();
1166
1167        /*
1168         * Destroy all connections.
1169         */
1170        sfe_ipv4_destroy_all_rules_for_dev(NULL);
1171#ifdef SFE_SUPPORT_IPV6
1172        sfe_ipv6_destroy_all_rules_for_dev(NULL);
1173#endif
1174#ifdef CONFIG_NF_CONNTRACK_EVENTS
1175        nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier);
1176
1177#endif
1178        nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
1179
1180#ifdef SFE_SUPPORT_IPV6
1181        if (unregister_inet6addr_notifier) {
1182                unregister_inet6addr_notifier(&sc->inet6_notifier);
1183        }
1184#endif
1185        unregister_inetaddr_notifier(&sc->inet_notifier);
1186        unregister_netdevice_notifier(&sc->dev_notifier);
1187
1188        kobject_put(sc->sys_sfe_cm);
1189        sfe_ipv4_exit();
1190#ifdef SFE_SUPPORT_IPV6
1191        sfe_ipv6_exit();
1192#endif
1193}
1194
1195module_init(sfe_cm_init)
1196module_exit(sfe_cm_exit)
1197
1198MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager");
1199MODULE_LICENSE("Dual BSD/GPL");
1200
Note: See TracBrowser for help on using the repository browser.