source: src/linux/universal/linux-3.18/net/core/dev.c @ 31869

Last change on this file since 31869 was 31869, checked in by brainslayer: update

File size: 183.7 KB
1/*
2 *      NET3    Protocol independent device support routines.
3 *
4 *              This program is free software; you can redistribute it and/or
5 *              modify it under the terms of the GNU General Public License
6 *              as published by the Free Software Foundation; either version
7 *              2 of the License, or (at your option) any later version.
8 *
9 *      Derived from the non IP parts of dev.c 1.0.19
10 *              Authors:        Ross Biro
11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 *      Additional Authors:
15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
17 *              David Hinds <dahinds@users.sourceforge.net>
18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 *              Adam Sulmicki <adam@cfar.umd.edu>
20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 *      Changes:
23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24 *                                      to 2 if register_netdev gets called
25 *                                      before net_dev_init & also removed a
26 *                                      few lines of code in the process.
27 *              Alan Cox        :       device private ioctl copies fields back.
28 *              Alan Cox        :       Transmit queue code does relevant
29 *                                      stunts to keep the queue safe.
30 *              Alan Cox        :       Fixed double lock.
31 *              Alan Cox        :       Fixed promisc NULL pointer trap
32 *              ????????        :       Support the full private ioctl range
33 *              Alan Cox        :       Moved ioctl permission check into
34 *                                      drivers
35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36 *              Alan Cox        :       100 backlog just doesn't cut it when
37 *                                      you start doing multicast video 8)
38 *              Alan Cox        :       Rewrote net_bh and list manager.
39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40 *              Alan Cox        :       Took out transmit every packet pass
41 *                                      Saved a few bytes in the ioctl handler
42 *              Alan Cox        :       Network driver sets packet type before
43 *                                      calling netif_rx. Saves a function
44 *                                      call a packet.
45 *              Alan Cox        :       Hashed net_bh()
46 *              Richard Kooijman:       Timestamp fixes.
47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48 *              Alan Cox        :       Device lock protection.
49 *              Alan Cox        :       Fixed nasty side effect of device close
50 *                                      changes.
51 *              Rudi Cilibrasi  :       Pass the right thing to
52 *                                      set_mac_address()
53 *              Dave Miller     :       32bit quantity for the device lock to
54 *                                      make it work out on a Sparc.
55 *              Bjorn Ekwall    :       Added KERNELD hack.
56 *              Alan Cox        :       Cleaned up the backlog initialise.
57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
58 *                                      1 device.
59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60 *                                      is no device open function.
61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63 *              Cyrus Durgin    :       Cleaned for KMOD
64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65 *                                      A network device unload needs to purge
66 *                                      the backlog queue.
67 *      Paul Rusty Russell      :       SIOCSIFNAME
68 *              Pekka Riikonen  :       Netdev boot-time settings code
69 *              Andrew Morton   :       Make unregister_netdevice wait
70 *                                      indefinitely on dev->refcnt
71 *              J Hadi Salim    :       - Backlog queue sampling
72 *                                      - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
100#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
101#include <linux/imq.h>
102#endif
103#include <linux/stat.h>
104#include <net/dst.h>
105#include <net/pkt_sched.h>
106#include <net/checksum.h>
107#include <net/xfrm.h>
108#include <linux/highmem.h>
109#include <linux/init.h>
110#include <linux/module.h>
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
114#include <net/iw_handler.h>
115#include <asm/current.h>
116#include <linux/audit.h>
117#include <linux/dmaengine.h>
118#include <linux/err.h>
119#include <linux/ctype.h>
120#include <linux/if_arp.h>
121#include <linux/if_vlan.h>
122#include <linux/ip.h>
123#include <net/ip.h>
124#include <linux/ipv6.h>
125#include <linux/in.h>
126#include <linux/jhash.h>
127#include <linux/random.h>
128#include <trace/events/napi.h>
129#include <trace/events/net.h>
130#include <trace/events/skb.h>
131#include <linux/pci.h>
132#include <linux/inetdevice.h>
133#include <linux/cpu_rmap.h>
134#include <linux/static_key.h>
135#include <linux/hashtable.h>
136#include <linux/vmalloc.h>
137#include <linux/if_macvlan.h>
138#include <linux/errqueue.h>
139
140#include "net-sysfs.h"
141
142
143#define BCMFASTPATH
144#define BCMFASTPATH_HOST
145
146/* Instead of increasing this, you should create a hash table. */
147#define MAX_GRO_SKBS 8
148
149/* This should be increased if a protocol with a bigger head is added. */
150#define GRO_MAX_HEAD (MAX_HEADER + 128)
151
152static DEFINE_SPINLOCK(ptype_lock);
153static DEFINE_SPINLOCK(offload_lock);
154struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155struct list_head ptype_all __read_mostly;       /* Taps */
156static struct list_head offload_base __read_mostly;
157
158static int netif_rx_internal(struct sk_buff *skb);
159static int call_netdevice_notifiers_info(unsigned long val,
160                                         struct net_device *dev,
161                                         struct netdev_notifier_info *info);
162
163/*
164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165 * semaphore.
166 *
167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168 *
169 * Writers must hold the rtnl semaphore while they loop through the
170 * dev_base_head list, and hold dev_base_lock for writing when they do the
171 * actual updates.  This allows pure readers to access the list even
172 * while a writer is preparing to update it.
173 *
174 * To put it another way, dev_base_lock is held for writing only to
175 * protect against pure readers; the rtnl semaphore provides the
176 * protection against other writers.
177 *
178 * See, for example usages, register_netdevice() and
179 * unregister_netdevice(), which must be called with the rtnl
180 * semaphore held.
181 */
182DEFINE_RWLOCK(dev_base_lock);
183EXPORT_SYMBOL(dev_base_lock);
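/* Usage sketch: a pure reader that only walks the device list can take
 * dev_base_lock for reading (or rcu_read_lock()) as described above;
 * the pr_info() walk below is purely illustrative:
 *
 *	struct net_device *dev;
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(&init_net, dev)
 *		pr_info("%s\n", dev->name);
 *	read_unlock(&dev_base_lock);
 */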
184
185/* protects napi_hash addition/deletion and napi_gen_id */
186static DEFINE_SPINLOCK(napi_hash_lock);
187
188static unsigned int napi_gen_id;
189static DEFINE_HASHTABLE(napi_hash, 8);
190
191static seqcount_t devnet_rename_seq;
192
193static inline void dev_base_seq_inc(struct net *net)
194{
195        while (++net->dev_base_seq == 0);
196}
197
198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199{
200        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
201
202        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203}
204
205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206{
207        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208}
209
210static inline void rps_lock(struct softnet_data *sd)
211{
212#ifdef CONFIG_RPS
213        spin_lock(&sd->input_pkt_queue.lock);
214#endif
215}
216
217static inline void rps_unlock(struct softnet_data *sd)
218{
219#ifdef CONFIG_RPS
220        spin_unlock(&sd->input_pkt_queue.lock);
221#endif
222}
223
224/* Device list insertion */
225static void list_netdevice(struct net_device *dev)
226{
227        struct net *net = dev_net(dev);
228
229        ASSERT_RTNL();
230
231        write_lock_bh(&dev_base_lock);
232        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234        hlist_add_head_rcu(&dev->index_hlist,
235                           dev_index_hash(net, dev->ifindex));
236        write_unlock_bh(&dev_base_lock);
237
238        dev_base_seq_inc(net);
239}
240
241/* Device list removal
242 * caller must respect a RCU grace period before freeing/reusing dev
243 */
244static void unlist_netdevice(struct net_device *dev)
245{
246        ASSERT_RTNL();
247
248        /* Unlink dev from the device chain */
249        write_lock_bh(&dev_base_lock);
250        list_del_rcu(&dev->dev_list);
251        hlist_del_rcu(&dev->name_hlist);
252        hlist_del_rcu(&dev->index_hlist);
253        write_unlock_bh(&dev_base_lock);
254
255        dev_base_seq_inc(dev_net(dev));
256}
257
258/*
259 *      Our notifier list
260 */
261
262static RAW_NOTIFIER_HEAD(netdev_chain);
263
264/*
265 *      Device drivers call our routines to queue packets here. We empty the
266 *      queue in the local softnet handler.
267 */
268
269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272#ifdef CONFIG_LOCKDEP
273/*
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
276 */
277static const unsigned short netdev_lock_type[] =
278        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293
294static const char *const netdev_lock_name[] =
295        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310
311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313
314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315{
316        int i;
317
318        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319                if (netdev_lock_type[i] == dev_type)
320                        return i;
321        /* the last key is used by default */
322        return ARRAY_SIZE(netdev_lock_type) - 1;
323}
324
325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326                                                 unsigned short dev_type)
327{
328        int i;
329
330        i = netdev_lock_pos(dev_type);
331        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332                                   netdev_lock_name[i]);
333}
334
335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336{
337        int i;
338
339        i = netdev_lock_pos(dev->type);
340        lockdep_set_class_and_name(&dev->addr_list_lock,
341                                   &netdev_addr_lock_key[i],
342                                   netdev_lock_name[i]);
343}
344#else
345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346                                                 unsigned short dev_type)
347{
348}
349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350{
351}
352#endif
353
354/*******************************************************************************
355
356                Protocol management and registration routines
357
358*******************************************************************************/
359
360/*
361 *      Add a protocol ID to the list. Now that the input handler is
362 *      smarter we can dispense with all the messy stuff that used to be
363 *      here.
364 *
365 *      BEWARE!!! Protocol handlers that mangle input packets
366 *      MUST BE last in the hash buckets, and checking of protocol handlers
367 *      MUST start from the promiscuous ptype_all chain in net_bh.
368 *      This holds today; do not change it.
369 *      Explanation: if a protocol handler that mangles packets were
370 *      first on the list, it could not tell that the packet is cloned
371 *      and should be copied-on-write, so it would modify it in place
372 *      and subsequent readers would get a broken packet.
373 *                                                      --ANK (980803)
374 */
375
376static inline struct list_head *ptype_head(const struct packet_type *pt)
377{
378        if (pt->type == htons(ETH_P_ALL))
379                return &ptype_all;
380        else
381                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
382}
383
384/**
385 *      dev_add_pack - add packet handler
386 *      @pt: packet type declaration
387 *
388 *      Add a protocol handler to the networking stack. The passed &packet_type
389 *      is linked into kernel lists and may not be freed until it has been
390 *      removed from the kernel lists.
391 *
392 *      This call does not sleep, therefore it cannot
393 *      guarantee that all CPUs in the middle of receiving packets
394 *      will see the new packet type (until the next received packet).
395 */
396
397void dev_add_pack(struct packet_type *pt)
398{
399        struct list_head *head = ptype_head(pt);
400
401        spin_lock(&ptype_lock);
402        list_add_rcu(&pt->list, head);
403        spin_unlock(&ptype_lock);
404}
405EXPORT_SYMBOL(dev_add_pack);
406
407/**
408 *      __dev_remove_pack        - remove packet handler
409 *      @pt: packet type declaration
410 *
411 *      Remove a protocol handler that was previously added to the kernel
412 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
413 *      from the kernel lists and can be freed or reused once this function
414 *      returns.
415 *
416 *      The packet type might still be in use by receivers
417 *      and must not be freed until after all the CPUs have gone
418 *      through a quiescent state.
419 */
420void __dev_remove_pack(struct packet_type *pt)
421{
422        struct list_head *head = ptype_head(pt);
423        struct packet_type *pt1;
424
425        spin_lock(&ptype_lock);
426
427        list_for_each_entry(pt1, head, list) {
428                if (pt == pt1) {
429                        list_del_rcu(&pt->list);
430                        goto out;
431                }
432        }
433
434        pr_warn("dev_remove_pack: %p not found\n", pt);
435out:
436        spin_unlock(&ptype_lock);
437}
438EXPORT_SYMBOL(__dev_remove_pack);
439
440/**
441 *      dev_remove_pack  - remove packet handler
442 *      @pt: packet type declaration
443 *
444 *      Remove a protocol handler that was previously added to the kernel
445 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
446 *      from the kernel lists and can be freed or reused once this function
447 *      returns.
448 *
449 *      This call sleeps to guarantee that no CPU is looking at the packet
450 *      type after return.
451 */
452void dev_remove_pack(struct packet_type *pt)
453{
454        __dev_remove_pack(pt);
455
456        synchronize_net();
457}
458EXPORT_SYMBOL(dev_remove_pack);
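/* Usage sketch: a protocol module typically registers a static
 * struct packet_type at init time and removes it on unload; the
 * handler and the ETH_P_ALL tap below are hypothetical:
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);	(module init)
 *	dev_remove_pack(&my_ptype);	(module exit, may sleep)
 */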
459
460
461/**
462 *      dev_add_offload - register offload handlers
463 *      @po: protocol offload declaration
464 *
465 *      Add protocol offload handlers to the networking stack. The passed
466 *      &proto_offload is linked into kernel lists and may not be freed until
467 *      it has been removed from the kernel lists.
468 *
469 *      This call does not sleep, therefore it cannot
470 *      guarantee that all CPUs in the middle of receiving packets
471 *      will see the new offload handlers (until the next received packet).
472 */
473void dev_add_offload(struct packet_offload *po)
474{
475        struct list_head *head = &offload_base;
476
477        spin_lock(&offload_lock);
478        list_add_rcu(&po->list, head);
479        spin_unlock(&offload_lock);
480}
481EXPORT_SYMBOL(dev_add_offload);
482
483/**
484 *      __dev_remove_offload     - remove offload handler
485 *      @po: packet offload declaration
486 *
487 *      Remove a protocol offload handler that was previously added to the
488 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
489 *      is removed from the kernel lists and can be freed or reused once this
490 *      function returns.
491 *
492 *      The packet type might still be in use by receivers
493 *      and must not be freed until after all the CPUs have gone
494 *      through a quiescent state.
495 */
496static void __dev_remove_offload(struct packet_offload *po)
497{
498        struct list_head *head = &offload_base;
499        struct packet_offload *po1;
500
501        spin_lock(&offload_lock);
502
503        list_for_each_entry(po1, head, list) {
504                if (po == po1) {
505                        list_del_rcu(&po->list);
506                        goto out;
507                }
508        }
509
510        pr_warn("dev_remove_offload: %p not found\n", po);
511out:
512        spin_unlock(&offload_lock);
513}
514
515/**
516 *      dev_remove_offload       - remove packet offload handler
517 *      @po: packet offload declaration
518 *
519 *      Remove a packet offload handler that was previously added to the kernel
520 *      offload handlers by dev_add_offload(). The passed &offload_type is
521 *      removed from the kernel lists and can be freed or reused once this
522 *      function returns.
523 *
524 *      This call sleeps to guarantee that no CPU is looking at the packet
525 *      type after return.
526 */
527void dev_remove_offload(struct packet_offload *po)
528{
529        __dev_remove_offload(po);
530
531        synchronize_net();
532}
533EXPORT_SYMBOL(dev_remove_offload);
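/* Usage sketch: GRO/GSO providers register a struct packet_offload in
 * much the same way; the ETH_P_FOO type and the callback names here
 * are hypothetical:
 *
 *	static struct packet_offload foo_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_FOO),
 *		.callbacks = {
 *			.gro_receive  = foo_gro_receive,
 *			.gro_complete = foo_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&foo_offload);
 *	dev_remove_offload(&foo_offload);
 */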
534
535/******************************************************************************
536
537                      Device Boot-time Settings Routines
538
539*******************************************************************************/
540
541/* Boot time configuration table */
542static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
543
544/**
545 *      netdev_boot_setup_add   - add new setup entry
546 *      @name: name of the device
547 *      @map: configured settings for the device
548 *
549 *      Adds a new setup entry to the dev_boot_setup list.  The function
550 *      returns 0 on error and 1 on success.  This is a generic routine for
551 *      all netdevices.
552 */
553static int netdev_boot_setup_add(char *name, struct ifmap *map)
554{
555        struct netdev_boot_setup *s;
556        int i;
557
558        s = dev_boot_setup;
559        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
560                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
561                        memset(s[i].name, 0, sizeof(s[i].name));
562                        strlcpy(s[i].name, name, IFNAMSIZ);
563                        memcpy(&s[i].map, map, sizeof(s[i].map));
564                        break;
565                }
566        }
567
568        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
569}
570
571/**
572 *      netdev_boot_setup_check - check boot time settings
573 *      @dev: the netdevice
574 *
575 *      Check boot time settings for the device.
576 *      The found settings are set for the device to be used
577 *      later in the device probing.
578 *      Returns 0 if no settings found, 1 if they are.
579 */
580int netdev_boot_setup_check(struct net_device *dev)
581{
582        struct netdev_boot_setup *s = dev_boot_setup;
583        int i;
584
585        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
586                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
587                    !strcmp(dev->name, s[i].name)) {
588                        dev->irq        = s[i].map.irq;
589                        dev->base_addr  = s[i].map.base_addr;
590                        dev->mem_start  = s[i].map.mem_start;
591                        dev->mem_end    = s[i].map.mem_end;
592                        return 1;
593                }
594        }
595        return 0;
596}
597EXPORT_SYMBOL(netdev_boot_setup_check);
598
599
600/**
601 *      netdev_boot_base        - get address from boot time settings
602 *      @prefix: prefix for network device
603 *      @unit: id for network device
604 *
605 *      Check boot time settings for the base address of the device.
606 *      Returns the configured base address if an entry is found, 1 if
607 *      the device is already registered (to indicate it should not be
608 *      probed), and 0 if no settings are found.
609 */
610unsigned long netdev_boot_base(const char *prefix, int unit)
611{
612        const struct netdev_boot_setup *s = dev_boot_setup;
613        char name[IFNAMSIZ];
614        int i;
615
616        sprintf(name, "%s%d", prefix, unit);
617
618        /*
619         * If device already registered then return base of 1
620         * to indicate not to probe for this interface
621         */
622        if (__dev_get_by_name(&init_net, name))
623                return 1;
624
625        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
626                if (!strcmp(name, s[i].name))
627                        return s[i].map.base_addr;
628        return 0;
629}
630
631/*
632 * Saves at boot time configured settings for any netdevice.
633 */
634int __init netdev_boot_setup(char *str)
635{
636        int ints[5];
637        struct ifmap map;
638
639        str = get_options(str, ARRAY_SIZE(ints), ints);
640        if (!str || !*str)
641                return 0;
642
643        /* Save settings */
644        memset(&map, 0, sizeof(map));
645        if (ints[0] > 0)
646                map.irq = ints[1];
647        if (ints[0] > 1)
648                map.base_addr = ints[2];
649        if (ints[0] > 2)
650                map.mem_start = ints[3];
651        if (ints[0] > 3)
652                map.mem_end = ints[4];
653
654        /* Add new entry to the list */
655        return netdev_boot_setup_add(str, &map);
656}
657
658__setup("netdev=", netdev_boot_setup);
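/* Usage sketch: on the kernel command line the parameter takes up to
 * four integers followed by the interface name, for example
 * (illustrative values)
 *
 *	netdev=5,0x340,0,0,eth1
 *
 * which stores irq=5 and base_addr=0x340 for eth1 until the device is
 * probed and netdev_boot_setup_check() picks the entry up.
 */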
659
660/*******************************************************************************
661
662                            Device Interface Subroutines
663
664*******************************************************************************/
665
666/**
667 *      __dev_get_by_name       - find a device by its name
668 *      @net: the applicable net namespace
669 *      @name: name to find
670 *
671 *      Find an interface by name. Must be called under RTNL semaphore
672 *      or @dev_base_lock. If the name is found a pointer to the device
673 *      is returned. If the name is not found then %NULL is returned. The
674 *      reference counters are not incremented so the caller must be
675 *      careful with locks.
676 */
677
678struct net_device *__dev_get_by_name(struct net *net, const char *name)
679{
680        struct net_device *dev;
681        struct hlist_head *head = dev_name_hash(net, name);
682
683        hlist_for_each_entry(dev, head, name_hlist)
684                if (!strncmp(dev->name, name, IFNAMSIZ))
685                        return dev;
686
687        return NULL;
688}
689EXPORT_SYMBOL(__dev_get_by_name);
690
691/**
692 *      dev_get_by_name_rcu     - find a device by its name
693 *      @net: the applicable net namespace
694 *      @name: name to find
695 *
696 *      Find an interface by name.
697 *      If the name is found a pointer to the device is returned.
698 *      If the name is not found then %NULL is returned.
699 *      The reference counters are not incremented so the caller must be
700 *      careful with locks. The caller must hold RCU lock.
701 */
702
703struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
704{
705        struct net_device *dev;
706        struct hlist_head *head = dev_name_hash(net, name);
707
708        hlist_for_each_entry_rcu(dev, head, name_hlist)
709                if (!strncmp(dev->name, name, IFNAMSIZ))
710                        return dev;
711
712        return NULL;
713}
714EXPORT_SYMBOL(dev_get_by_name_rcu);
715
716/**
717 *      dev_get_by_name         - find a device by its name
718 *      @net: the applicable net namespace
719 *      @name: name to find
720 *
721 *      Find an interface by name. This can be called from any
722 *      context and does its own locking. The returned handle has
723 *      the usage count incremented and the caller must use dev_put() to
724 *      release it when it is no longer needed. %NULL is returned if no
725 *      matching device is found.
726 */
727
728struct net_device *dev_get_by_name(struct net *net, const char *name)
729{
730        struct net_device *dev;
731
732        rcu_read_lock();
733        dev = dev_get_by_name_rcu(net, name);
734        if (dev)
735                dev_hold(dev);
736        rcu_read_unlock();
737        return dev;
738}
739EXPORT_SYMBOL(dev_get_by_name);
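/* Usage sketch: the refcounted lookup must be paired with dev_put();
 * the "eth0"/init_net values are only an example:
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 */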
740
741/**
742 *      __dev_get_by_index - find a device by its ifindex
743 *      @net: the applicable net namespace
744 *      @ifindex: index of device
745 *
746 *      Search for an interface by index. Returns %NULL if the device
747 *      is not found or a pointer to the device. The device has not
748 *      had its reference counter increased so the caller must be careful
749 *      about locking. The caller must hold either the RTNL semaphore
750 *      or @dev_base_lock.
751 */
752
753struct net_device *__dev_get_by_index(struct net *net, int ifindex)
754{
755        struct net_device *dev;
756        struct hlist_head *head = dev_index_hash(net, ifindex);
757
758        hlist_for_each_entry(dev, head, index_hlist)
759                if (dev->ifindex == ifindex)
760                        return dev;
761
762        return NULL;
763}
764EXPORT_SYMBOL(__dev_get_by_index);
765
766/**
767 *      dev_get_by_index_rcu - find a device by its ifindex
768 *      @net: the applicable net namespace
769 *      @ifindex: index of device
770 *
771 *      Search for an interface by index. Returns %NULL if the device
772 *      is not found or a pointer to the device. The device has not
773 *      had its reference counter increased so the caller must be careful
774 *      about locking. The caller must hold RCU lock.
775 */
776
777struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
778{
779        struct net_device *dev;
780        struct hlist_head *head = dev_index_hash(net, ifindex);
781
782        hlist_for_each_entry_rcu(dev, head, index_hlist)
783                if (dev->ifindex == ifindex)
784                        return dev;
785
786        return NULL;
787}
788EXPORT_SYMBOL(dev_get_by_index_rcu);
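/* Usage sketch: the _rcu lookup is only valid inside an RCU read-side
 * section and takes no reference, so the pointer must not be used
 * after rcu_read_unlock() (the ifindex value is illustrative):
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(&init_net, ifindex);
 *	if (dev)
 *		pr_info("ifindex %d is %s\n", ifindex, dev->name);
 *	rcu_read_unlock();
 */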
789
790
791/**
792 *      dev_get_by_index - find a device by its ifindex
793 *      @net: the applicable net namespace
794 *      @ifindex: index of device
795 *
796 *      Search for an interface by index. Returns NULL if the device
797 *      is not found or a pointer to the device. The device returned has
798 *      had a reference added and the pointer is safe until the user calls
799 *      dev_put to indicate they have finished with it.
800 */
801
802struct net_device *dev_get_by_index(struct net *net, int ifindex)
803{
804        struct net_device *dev;
805
806        rcu_read_lock();
807        dev = dev_get_by_index_rcu(net, ifindex);
808        if (dev)
809                dev_hold(dev);
810        rcu_read_unlock();
811        return dev;
812}
813EXPORT_SYMBOL(dev_get_by_index);
814
815/**
816 *      netdev_get_name - get a netdevice name, knowing its ifindex.
817 *      @net: network namespace
818 *      @name: a pointer to the buffer where the name will be stored.
819 *      @ifindex: the ifindex of the interface to get the name from.
820 *
821 *      The use of raw_seqcount_begin() and cond_resched() before
822 *      retrying is required as we want to give the writers a chance
823 *      to complete when CONFIG_PREEMPT is not set.
824 */
825int netdev_get_name(struct net *net, char *name, int ifindex)
826{
827        struct net_device *dev;
828        unsigned int seq;
829
830retry:
831        seq = raw_seqcount_begin(&devnet_rename_seq);
832        rcu_read_lock();
833        dev = dev_get_by_index_rcu(net, ifindex);
834        if (!dev) {
835                rcu_read_unlock();
836                return -ENODEV;
837        }
838
839        strcpy(name, dev->name);
840        rcu_read_unlock();
841        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
842                cond_resched();
843                goto retry;
844        }
845
846        return 0;
847}
848
849/**
850 *      dev_getbyhwaddr_rcu - find a device by its hardware address
851 *      @net: the applicable net namespace
852 *      @type: media type of device
853 *      @ha: hardware address
854 *
855 *      Search for an interface by MAC address. Returns NULL if the device
856 *      is not found or a pointer to the device.
857 *      The caller must hold RCU or RTNL.
858 *      The returned device has not had its ref count increased
859 *      and the caller must therefore be careful about locking
860 *
861 */
862
863struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
864                                       const char *ha)
865{
866        struct net_device *dev;
867
868        for_each_netdev_rcu(net, dev)
869                if (dev->type == type &&
870                    !memcmp(dev->dev_addr, ha, dev->addr_len))
871                        return dev;
872
873        return NULL;
874}
875EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
876
877struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
878{
879        struct net_device *dev;
880
881        ASSERT_RTNL();
882        for_each_netdev(net, dev)
883                if (dev->type == type)
884                        return dev;
885
886        return NULL;
887}
888EXPORT_SYMBOL(__dev_getfirstbyhwtype);
889
890struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
891{
892        struct net_device *dev, *ret = NULL;
893
894        rcu_read_lock();
895        for_each_netdev_rcu(net, dev)
896                if (dev->type == type) {
897                        dev_hold(dev);
898                        ret = dev;
899                        break;
900                }
901        rcu_read_unlock();
902        return ret;
903}
904EXPORT_SYMBOL(dev_getfirstbyhwtype);
905
906/**
907 *      __dev_get_by_flags - find any device with given flags
908 *      @net: the applicable net namespace
909 *      @if_flags: IFF_* values
910 *      @mask: bitmask of bits in if_flags to check
911 *
912 *      Search for any interface with the given flags. Returns NULL if a device
913 *      is not found or a pointer to the device. Must be called inside
914 *      rtnl_lock(), and result refcount is unchanged.
915 */
916
917struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
918                                      unsigned short mask)
919{
920        struct net_device *dev, *ret;
921
922        ASSERT_RTNL();
923
924        ret = NULL;
925        for_each_netdev(net, dev) {
926                if (((dev->flags ^ if_flags) & mask) == 0) {
927                        ret = dev;
928                        break;
929                }
930        }
931        return ret;
932}
933EXPORT_SYMBOL(__dev_get_by_flags);
934
935/**
936 *      dev_valid_name - check if name is okay for network device
937 *      @name: name string
938 *
939 *      Network device names need to be valid file names
940 *      to allow sysfs to work.  We also disallow any kind of
941 *      whitespace.
942 */
943bool dev_valid_name(const char *name)
944{
945        if (*name == '\0')
946                return false;
947        if (strlen(name) >= IFNAMSIZ)
948                return false;
949        if (!strcmp(name, ".") || !strcmp(name, ".."))
950                return false;
951
952        while (*name) {
953                if (*name == '/' || *name == ':' || isspace(*name))
954                        return false;
955                name++;
956        }
957        return true;
958}
959EXPORT_SYMBOL(dev_valid_name);
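/* Examples: names such as "eth0" or "wlan-guest" pass the check above,
 * while "", ".", "..", "a/b", "a:b", names containing whitespace, and
 * anything of IFNAMSIZ (16) characters or more are rejected.
 */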
960
961/**
962 *      __dev_alloc_name - allocate a name for a device
963 *      @net: network namespace to allocate the device name in
964 *      @name: name format string
965 *      @buf:  scratch buffer and result name string
966 *
967 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
968 *      id. It scans list of devices to build up a free map, then chooses
969 *      the first empty slot. The caller must hold the dev_base or rtnl lock
970 *      while allocating the name and adding the device in order to avoid
971 *      duplicates.
972 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
973 *      Returns the number of the unit assigned or a negative errno code.
974 */
975
976static int __dev_alloc_name(struct net *net, const char *name, char *buf)
977{
978        int i = 0;
979        const char *p;
980        const int max_netdevices = 8*PAGE_SIZE;
981        unsigned long *inuse;
982        struct net_device *d;
983
984        p = strnchr(name, IFNAMSIZ-1, '%');
985        if (p) {
986                /*
987                 * Verify the string as this thing may have come from
988                 * the user.  There must be either one "%d" and no other "%"
989                 * characters.
990                 */
991                if (p[1] != 'd' || strchr(p + 2, '%'))
992                        return -EINVAL;
993
994                /* Use one page as a bit array of possible slots */
995                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
996                if (!inuse)
997                        return -ENOMEM;
998
999                for_each_netdev(net, d) {
1000                        if (!sscanf(d->name, name, &i))
1001                                continue;
1002                        if (i < 0 || i >= max_netdevices)
1003                                continue;
1004
1005                        /*  avoid cases where sscanf is not exact inverse of printf */
1006                        snprintf(buf, IFNAMSIZ, name, i);
1007                        if (!strncmp(buf, d->name, IFNAMSIZ))
1008                                set_bit(i, inuse);
1009                }
1010
1011                i = find_first_zero_bit(inuse, max_netdevices);
1012                free_page((unsigned long) inuse);
1013        }
1014
1015        if (buf != name)
1016                snprintf(buf, IFNAMSIZ, name, i);
1017        if (!__dev_get_by_name(net, buf))
1018                return i;
1019
1020        /* It is possible to run out of possible slots
1021         * when the name is long and there isn't enough space left
1022         * for the digits, or if all bits are used.
1023         */
1024        return -ENFILE;
1025}
1026
1027/**
1028 *      dev_alloc_name - allocate a name for a device
1029 *      @dev: device
1030 *      @name: name format string
1031 *
1032 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
1033 *      id. It scans list of devices to build up a free map, then chooses
1034 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1035 *      while allocating the name and adding the device in order to avoid
1036 *      duplicates.
1037 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1038 *      Returns the number of the unit assigned or a negative errno code.
1039 */
1040
1041int dev_alloc_name(struct net_device *dev, const char *name)
1042{
1043        char buf[IFNAMSIZ];
1044        struct net *net;
1045        int ret;
1046
1047        BUG_ON(!dev_net(dev));
1048        net = dev_net(dev);
1049        ret = __dev_alloc_name(net, name, buf);
1050        if (ret >= 0)
1051                strlcpy(dev->name, buf, IFNAMSIZ);
1052        return ret;
1053}
1054EXPORT_SYMBOL(dev_alloc_name);
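/* Usage sketch: drivers normally pass a format such as "eth%d" before
 * registration and get the first free unit back (the return value and
 * resulting name below are illustrative):
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		return err;
 *	pr_info("allocated %s (unit %d)\n", dev->name, err);
 */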
1055
1056static int dev_alloc_name_ns(struct net *net,
1057                             struct net_device *dev,
1058                             const char *name)
1059{
1060        char buf[IFNAMSIZ];
1061        int ret;
1062
1063        ret = __dev_alloc_name(net, name, buf);
1064        if (ret >= 0)
1065                strlcpy(dev->name, buf, IFNAMSIZ);
1066        return ret;
1067}
1068
1069static int dev_get_valid_name(struct net *net,
1070                              struct net_device *dev,
1071                              const char *name)
1072{
1073        BUG_ON(!net);
1074
1075        if (!dev_valid_name(name))
1076                return -EINVAL;
1077
1078        if (strchr(name, '%'))
1079                return dev_alloc_name_ns(net, dev, name);
1080        else if (__dev_get_by_name(net, name))
1081                return -EEXIST;
1082        else if (dev->name != name)
1083                strlcpy(dev->name, name, IFNAMSIZ);
1084
1085        return 0;
1086}
1087
1088/**
1089 *      dev_change_name - change name of a device
1090 *      @dev: device
1091 *      @newname: name (or format string) must be at least IFNAMSIZ
1092 *
1093 *      Change the name of a device. Format strings such as "eth%d"
1094 *      can be passed for wildcarding.
1095 */
1096int dev_change_name(struct net_device *dev, const char *newname)
1097{
1098        unsigned char old_assign_type;
1099        char oldname[IFNAMSIZ];
1100        int err = 0;
1101        int ret;
1102        struct net *net;
1103
1104        ASSERT_RTNL();
1105        BUG_ON(!dev_net(dev));
1106
1107        net = dev_net(dev);
1108        if (dev->flags & IFF_UP)
1109                return -EBUSY;
1110
1111        write_seqcount_begin(&devnet_rename_seq);
1112
1113        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1114                write_seqcount_end(&devnet_rename_seq);
1115                return 0;
1116        }
1117
1118        memcpy(oldname, dev->name, IFNAMSIZ);
1119
1120        err = dev_get_valid_name(net, dev, newname);
1121        if (err < 0) {
1122                write_seqcount_end(&devnet_rename_seq);
1123                return err;
1124        }
1125
1126        if (oldname[0] && !strchr(oldname, '%'))
1127                netdev_info(dev, "renamed from %s\n", oldname);
1128
1129        old_assign_type = dev->name_assign_type;
1130        dev->name_assign_type = NET_NAME_RENAMED;
1131
1132rollback:
1133        ret = device_rename(&dev->dev, dev->name);
1134        if (ret) {
1135                memcpy(dev->name, oldname, IFNAMSIZ);
1136                dev->name_assign_type = old_assign_type;
1137                write_seqcount_end(&devnet_rename_seq);
1138                return ret;
1139        }
1140
1141        write_seqcount_end(&devnet_rename_seq);
1142
1143        netdev_adjacent_rename_links(dev, oldname);
1144
1145        write_lock_bh(&dev_base_lock);
1146        hlist_del_rcu(&dev->name_hlist);
1147        write_unlock_bh(&dev_base_lock);
1148
1149        synchronize_rcu();
1150
1151        write_lock_bh(&dev_base_lock);
1152        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1153        write_unlock_bh(&dev_base_lock);
1154
1155        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1156        ret = notifier_to_errno(ret);
1157
1158        if (ret) {
1159                /* err >= 0 after dev_alloc_name() or stores the first errno */
1160                if (err >= 0) {
1161                        err = ret;
1162                        write_seqcount_begin(&devnet_rename_seq);
1163                        memcpy(dev->name, oldname, IFNAMSIZ);
1164                        memcpy(oldname, newname, IFNAMSIZ);
1165                        dev->name_assign_type = old_assign_type;
1166                        old_assign_type = NET_NAME_RENAMED;
1167                        goto rollback;
1168                } else {
1169                        pr_err("%s: name change rollback failed: %d\n",
1170                               dev->name, ret);
1171                }
1172        }
1173
1174        return err;
1175}
1176
1177/**
1178 *      dev_set_alias - change ifalias of a device
1179 *      @dev: device
1180 *      @alias: name up to IFALIASZ
1181 *      @len: limit of bytes to copy from info
1182 *
1183 *      Set the ifalias for a device.
1184 */
1185int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1186{
1187        char *new_ifalias;
1188
1189        ASSERT_RTNL();
1190
1191        if (len >= IFALIASZ)
1192                return -EINVAL;
1193
1194        if (!len) {
1195                kfree(dev->ifalias);
1196                dev->ifalias = NULL;
1197                return 0;
1198        }
1199
1200        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1201        if (!new_ifalias)
1202                return -ENOMEM;
1203        dev->ifalias = new_ifalias;
1204
1205        strlcpy(dev->ifalias, alias, len+1);
1206        return len;
1207}
1208
1209
1210/**
1211 *      netdev_features_change - device changes features
1212 *      @dev: device to cause notification
1213 *
1214 *      Called to indicate a device has changed features.
1215 */
1216void netdev_features_change(struct net_device *dev)
1217{
1218        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1219}
1220EXPORT_SYMBOL(netdev_features_change);
1221
1222/**
1223 *      netdev_state_change - device changes state
1224 *      @dev: device to cause notification
1225 *
1226 *      Called to indicate a device has changed state. This function calls
1227 *      the notifier chains for netdev_chain and sends a NEWLINK message
1228 *      to the routing socket.
1229 */
1230void netdev_state_change(struct net_device *dev)
1231{
1232        if (dev->flags & IFF_UP) {
1233                struct netdev_notifier_change_info change_info;
1234
1235                change_info.flags_changed = 0;
1236                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1237                                              &change_info.info);
1238                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1239        }
1240}
1241EXPORT_SYMBOL(netdev_state_change);
1242
1243/**
1244 *      netdev_notify_peers - notify network peers about existence of @dev
1245 *      @dev: network device
1246 *
1247 * Generate traffic such that interested network peers are aware of
1248 * @dev, such as by generating a gratuitous ARP. This may be used when
1249 * a device wants to inform the rest of the network about some sort of
1250 * reconfiguration such as a failover event or virtual machine
1251 * migration.
1252 */
1253void netdev_notify_peers(struct net_device *dev)
1254{
1255        rtnl_lock();
1256        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1257        rtnl_unlock();
1258}
1259EXPORT_SYMBOL(netdev_notify_peers);
1260
1261static int __dev_open(struct net_device *dev)
1262{
1263        const struct net_device_ops *ops = dev->netdev_ops;
1264        int ret;
1265
1266        ASSERT_RTNL();
1267
1268        if (!netif_device_present(dev))
1269                return -ENODEV;
1270
1271        /* Block netpoll from trying to do any rx path servicing.
1272         * If we don't do this there is a chance ndo_poll_controller
1273         * or ndo_poll may be running while we open the device
1274         */
1275        netpoll_poll_disable(dev);
1276
1277        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1278        ret = notifier_to_errno(ret);
1279        if (ret)
1280                return ret;
1281
1282        set_bit(__LINK_STATE_START, &dev->state);
1283
1284        if (ops->ndo_validate_addr)
1285                ret = ops->ndo_validate_addr(dev);
1286
1287        if (!ret && ops->ndo_open)
1288                ret = ops->ndo_open(dev);
1289
1290        netpoll_poll_enable(dev);
1291
1292        if (ret)
1293                clear_bit(__LINK_STATE_START, &dev->state);
1294        else {
1295                dev->flags |= IFF_UP;
1296                dev_set_rx_mode(dev);
1297                dev_activate(dev);
1298                add_device_randomness(dev->dev_addr, dev->addr_len);
1299        }
1300
1301        return ret;
1302}
1303
1304/**
1305 *      dev_open        - prepare an interface for use.
1306 *      @dev:   device to open
1307 *
1308 *      Takes a device from down to up state. The device's private open
1309 *      function is invoked and then the multicast lists are loaded. Finally
1310 *      the device is moved into the up state and a %NETDEV_UP message is
1311 *      sent to the netdev notifier chain.
1312 *
1313 *      Calling this function on an active interface is a nop. On a failure
1314 *      a negative errno code is returned.
1315 */
1316int dev_open(struct net_device *dev)
1317{
1318        int ret;
1319
1320        if (dev->flags & IFF_UP)
1321                return 0;
1322
1323        ret = __dev_open(dev);
1324        if (ret < 0)
1325                return ret;
1326
1327        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1328        call_netdevice_notifiers(NETDEV_UP, dev);
1329
1330        return ret;
1331}
1332EXPORT_SYMBOL(dev_open);
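/* Usage sketch: __dev_open() asserts the RTNL, so out-of-line callers
 * bring an interface up roughly like this (error handling elided):
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */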
1333
1334static int __dev_close_many(struct list_head *head)
1335{
1336        struct net_device *dev;
1337
1338        ASSERT_RTNL();
1339        might_sleep();
1340
1341        list_for_each_entry(dev, head, close_list) {
1342                /* Temporarily disable netpoll until the interface is down */
1343                netpoll_poll_disable(dev);
1344
1345                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1346
1347                clear_bit(__LINK_STATE_START, &dev->state);
1348
1349                /* Synchronize to scheduled poll. We cannot touch poll list, it
1350                 * can be even on different cpu. So just clear netif_running().
1351                 *
1352                 * dev->stop() will invoke napi_disable() on all of its
1353                 * napi_struct instances on this device.
1354                 */
1355                smp_mb__after_atomic(); /* Commit netif_running(). */
1356        }
1357
1358        dev_deactivate_many(head);
1359
1360        list_for_each_entry(dev, head, close_list) {
1361                const struct net_device_ops *ops = dev->netdev_ops;
1362
1363                /*
1364                 *      Call the device specific close. This cannot fail.
1365                 *      Only if device is UP
1366                 *
1367                 *      We allow it to be called even after a DETACH hot-plug
1368                 *      event.
1369                 */
1370                if (ops->ndo_stop)
1371                        ops->ndo_stop(dev);
1372
1373                dev->flags &= ~IFF_UP;
1374                netpoll_poll_enable(dev);
1375        }
1376
1377        return 0;
1378}
1379
1380static int __dev_close(struct net_device *dev)
1381{
1382        int retval;
1383        LIST_HEAD(single);
1384
1385        list_add(&dev->close_list, &single);
1386        retval = __dev_close_many(&single);
1387        list_del(&single);
1388
1389        return retval;
1390}
1391
1392static int dev_close_many(struct list_head *head)
1393{
1394        struct net_device *dev, *tmp;
1395
1396        /* Remove the devices that don't need to be closed */
1397        list_for_each_entry_safe(dev, tmp, head, close_list)
1398                if (!(dev->flags & IFF_UP))
1399                        list_del_init(&dev->close_list);
1400
1401        __dev_close_many(head);
1402
1403        list_for_each_entry_safe(dev, tmp, head, close_list) {
1404                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1405                call_netdevice_notifiers(NETDEV_DOWN, dev);
1406                list_del_init(&dev->close_list);
1407        }
1408
1409        return 0;
1410}
1411
1412/**
1413 *      dev_close - shutdown an interface.
1414 *      @dev: device to shutdown
1415 *
1416 *      This function moves an active device into down state. A
1417 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1418 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1419 *      chain.
1420 */
1421int dev_close(struct net_device *dev)
1422{
1423        if (dev->flags & IFF_UP) {
1424                LIST_HEAD(single);
1425
1426                list_add(&dev->close_list, &single);
1427                dev_close_many(&single);
1428                list_del(&single);
1429        }
1430        return 0;
1431}
1432EXPORT_SYMBOL(dev_close);
1433
1434
1435/**
1436 *      dev_disable_lro - disable Large Receive Offload on a device
1437 *      @dev: device
1438 *
1439 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1440 *      called under RTNL.  This is needed if received packets may be
1441 *      forwarded to another interface.
1442 */
1443void dev_disable_lro(struct net_device *dev)
1444{
1445        /*
1446         * If we're trying to disable lro on a vlan device
1447         * use the underlying physical device instead
1448         */
1449        if (is_vlan_dev(dev))
1450                dev = vlan_dev_real_dev(dev);
1451
1452        /* the same for macvlan devices */
1453        if (netif_is_macvlan(dev))
1454                dev = macvlan_dev_real_dev(dev);
1455
1456        dev->wanted_features &= ~NETIF_F_LRO;
1457        netdev_update_features(dev);
1458
1459        if (unlikely(dev->features & NETIF_F_LRO))
1460                netdev_WARN(dev, "failed to disable LRO!\n");
1461}
1462EXPORT_SYMBOL(dev_disable_lro);
1463
1464static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1465                                   struct net_device *dev)
1466{
1467        struct netdev_notifier_info info;
1468
1469        netdev_notifier_info_init(&info, dev);
1470        return nb->notifier_call(nb, val, &info);
1471}
1472
1473static int dev_boot_phase = 1;
1474
1475/**
1476 *      register_netdevice_notifier - register a network notifier block
1477 *      @nb: notifier
1478 *
1479 *      Register a notifier to be called when network device events occur.
1480 *      The notifier passed is linked into the kernel structures and must
1481 *      not be reused until it has been unregistered. A negative errno code
1482 *      is returned on a failure.
1483 *
1484 *      When registered, all registration and up events are replayed
1485 *      to the new notifier to give it a race-free view of the
1486 *      network device list.
1487 */
1488
1489int register_netdevice_notifier(struct notifier_block *nb)
1490{
1491        struct net_device *dev;
1492        struct net_device *last;
1493        struct net *net;
1494        int err;
1495
1496        rtnl_lock();
1497        err = raw_notifier_chain_register(&netdev_chain, nb);
1498        if (err)
1499                goto unlock;
1500        if (dev_boot_phase)
1501                goto unlock;
1502        for_each_net(net) {
1503                for_each_netdev(net, dev) {
1504                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1505                        err = notifier_to_errno(err);
1506                        if (err)
1507                                goto rollback;
1508
1509                        if (!(dev->flags & IFF_UP))
1510                                continue;
1511
1512                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1513                }
1514        }
1515
1516unlock:
1517        rtnl_unlock();
1518        return err;
1519
1520rollback:
1521        last = dev;
1522        for_each_net(net) {
1523                for_each_netdev(net, dev) {
1524                        if (dev == last)
1525                                goto outroll;
1526
1527                        if (dev->flags & IFF_UP) {
1528                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1529                                                        dev);
1530                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1531                        }
1532                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1533                }
1534        }
1535
1536outroll:
1537        raw_notifier_chain_unregister(&netdev_chain, nb);
1538        goto unlock;
1539}
1540EXPORT_SYMBOL(register_netdevice_notifier);
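/* Usage sketch: a subsystem interested in device events registers a
 * notifier_block; the callback and message below are hypothetical:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */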
1541
1542/**
1543 *      unregister_netdevice_notifier - unregister a network notifier block
1544 *      @nb: notifier
1545 *
1546 *      Unregister a notifier previously registered by
1547 *      register_netdevice_notifier(). The notifier is unlinked from the
1548 *      kernel structures and may then be reused. A negative errno code
1549 *      is returned on a failure.
1550 *
1551 *      After unregistering unregister and down device events are synthesized
1552 *      for all devices on the device list to the removed notifier to remove
1553 *      the need for special case cleanup code.
1554 */
1555
1556int unregister_netdevice_notifier(struct notifier_block *nb)
1557{
1558        struct net_device *dev;
1559        struct net *net;
1560        int err;
1561
1562        rtnl_lock();
1563        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1564        if (err)
1565                goto unlock;
1566
1567        for_each_net(net) {
1568                for_each_netdev(net, dev) {
1569                        if (dev->flags & IFF_UP) {
1570                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1571                                                        dev);
1572                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1573                        }
1574                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1575                }
1576        }
1577unlock:
1578        rtnl_unlock();
1579        return err;
1580}
1581EXPORT_SYMBOL(unregister_netdevice_notifier);
1582
1583/**
1584 *      call_netdevice_notifiers_info - call all network notifier blocks
1585 *      @val: value passed unmodified to notifier function
1586 *      @dev: net_device pointer passed unmodified to notifier function
1587 *      @info: notifier information data
1588 *
1589 *      Call all network notifier blocks.  Parameters and return value
1590 *      are as for raw_notifier_call_chain().
1591 */
1592
1593static int call_netdevice_notifiers_info(unsigned long val,
1594                                         struct net_device *dev,
1595                                         struct netdev_notifier_info *info)
1596{
1597        ASSERT_RTNL();
1598        netdev_notifier_info_init(info, dev);
1599        return raw_notifier_call_chain(&netdev_chain, val, info);
1600}
1601
1602/**
1603 *      call_netdevice_notifiers - call all network notifier blocks
1604 *      @val: value passed unmodified to notifier function
1605 *      @dev: net_device pointer passed unmodified to notifier function
1606 *
1607 *      Call all network notifier blocks.  Parameters and return value
1608 *      are as for raw_notifier_call_chain().
1609 */
1610
1611int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1612{
1613        struct netdev_notifier_info info;
1614
1615        return call_netdevice_notifiers_info(val, dev, &info);
1616}
1617EXPORT_SYMBOL(call_netdevice_notifiers);
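
/*
 * Example (greatly simplified, illustrative sketch; foo_set_mtu() is
 * hypothetical): code that changes device state under RTNL publishes
 * the change to every registered notifier.  This mirrors the pattern
 * used by helpers such as dev_set_mtu(), which emit NETDEV_CHANGEMTU
 * after updating dev->mtu.
 */
#if 0
static void foo_set_mtu(struct net_device *dev, int new_mtu)
{
        ASSERT_RTNL();

        dev->mtu = new_mtu;
        call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
}
#endif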
1618
1619static struct static_key netstamp_needed __read_mostly;
1620#ifdef HAVE_JUMP_LABEL
1621static atomic_t netstamp_needed_deferred;
1622static void netstamp_clear(struct work_struct *work)
1623{
1624        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1625
1626        while (deferred--)
1627                static_key_slow_dec(&netstamp_needed);
1628}
1629static DECLARE_WORK(netstamp_work, netstamp_clear);
1630#endif
1631
1632void net_enable_timestamp(void)
1633{
1634        static_key_slow_inc(&netstamp_needed);
1635}
1636EXPORT_SYMBOL(net_enable_timestamp);
1637
1638void net_disable_timestamp(void)
1639{
1640#ifdef HAVE_JUMP_LABEL
1641        /* net_disable_timestamp() can be called from non process context */
1642        atomic_inc(&netstamp_needed_deferred);
1643        schedule_work(&netstamp_work);
1644#else
1645        static_key_slow_dec(&netstamp_needed);
1646#endif
1647}
1648EXPORT_SYMBOL(net_disable_timestamp);
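
/*
 * Example (illustrative sketch; the foo_* functions are hypothetical):
 * a feature that needs software packet timestamps holds a reference on
 * the static key for as long as it is active, so net_timestamp_set()
 * stays a no-op for everyone else.
 */
#if 0
static void foo_capture_start(void)
{
        net_enable_timestamp();
}

static void foo_capture_stop(void)
{
        /* safe even from atomic context, see the deferred path above */
        net_disable_timestamp();
}
#endif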
1649
1650static inline void net_timestamp_set(struct sk_buff *skb)
1651{
1652        skb->tstamp.tv64 = 0;
1653        if (static_key_false(&netstamp_needed))
1654                __net_timestamp(skb);
1655}
1656
1657#define net_timestamp_check(COND, SKB)                  \
1658        if (static_key_false(&netstamp_needed)) {               \
1659                if ((COND) && !(SKB)->tstamp.tv64)      \
1660                        __net_timestamp(SKB);           \
1661        }                                               \
1662
1663bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1664{
1665        unsigned int len;
1666
1667        if (!(dev->flags & IFF_UP))
1668                return false;
1669
1670        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1671        if (skb->len <= len)
1672                return true;
1673
1674        /* if TSO is enabled, we don't care about the length as the packet
1675         * could be forwarded without having been segmented yet
1676         */
1677        if (skb_is_gso(skb))
1678                return true;
1679
1680        return false;
1681}
1682EXPORT_SYMBOL_GPL(is_skb_forwardable);
1683
1684int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1685{
1686        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1687                if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1688                        atomic_long_inc(&dev->rx_dropped);
1689                        kfree_skb(skb);
1690                        return NET_RX_DROP;
1691                }
1692        }
1693
1694        if (unlikely(!is_skb_forwardable(dev, skb))) {
1695                atomic_long_inc(&dev->rx_dropped);
1696                kfree_skb(skb);
1697                return NET_RX_DROP;
1698        }
1699
1700        skb_scrub_packet(skb, true);
1701        skb->protocol = eth_type_trans(skb, dev);
1702        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1703
1704        return 0;
1705}
1706EXPORT_SYMBOL_GPL(__dev_forward_skb);
1707
1708/**
1709 * dev_forward_skb - loopback an skb to another netif
1710 *
1711 * @dev: destination network device
1712 * @skb: buffer to forward
1713 *
1714 * return values:
1715 *      NET_RX_SUCCESS  (no congestion)
1716 *      NET_RX_DROP     (packet was dropped, but freed)
1717 *
1718 * dev_forward_skb can be used for injecting an skb from the
1719 * start_xmit function of one device into the receive queue
1720 * of another device.
1721 *
1722 * The receiving device may be in another namespace, so
1723 * we have to clear all information in the skb that could
1724 * impact namespace isolation.
1725 */
1726int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1727{
1728        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1729}
1730EXPORT_SYMBOL_GPL(dev_forward_skb);
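
/*
 * Example (illustrative sketch of the pattern described above, loosely
 * modelled on a veth-style device pair; struct foo_priv and the foo_*
 * names are hypothetical): the transmit routine of one device injects
 * the skb into its peer's receive path.  Note that the skb is consumed
 * either way, so its length is sampled first.
 */
#if 0
struct foo_priv {
        struct net_device *peer;
};

static netdev_tx_t foo_start_xmit(struct sk_buff *skb,
                                  struct net_device *dev)
{
        struct foo_priv *priv = netdev_priv(dev);
        unsigned int len = skb->len;

        if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS) {
                dev->stats.tx_packets++;
                dev->stats.tx_bytes += len;
        } else {
                dev->stats.tx_dropped++;
        }
        return NETDEV_TX_OK;
}
#endif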
1731
1732static inline int deliver_skb(struct sk_buff *skb,
1733                              struct packet_type *pt_prev,
1734                              struct net_device *orig_dev)
1735{
1736        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1737                return -ENOMEM;
1738        atomic_inc(&skb->users);
1739        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1740}
1741
1742static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1743{
1744        if (!ptype->af_packet_priv || !skb->sk)
1745                return false;
1746
1747        if (ptype->id_match)
1748                return ptype->id_match(ptype, skb->sk);
1749        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1750                return true;
1751
1752        return false;
1753}
1754
1755/*
1756 *      Support routine. Sends outgoing frames to any network
1757 *      taps currently in use.
1758 */
1759
1760static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1761{
1762        struct packet_type *ptype;
1763        struct sk_buff *skb2 = NULL;
1764        struct packet_type *pt_prev = NULL;
1765
1766        rcu_read_lock();
1767        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1768                /* Never send packets back to the socket
1769                 * they originated from - MvS (miquels@drinkel.ow.org)
1770                 */
1771                if ((ptype->dev == dev || !ptype->dev) &&
1772                    (!skb_loop_sk(ptype, skb))) {
1773                        if (pt_prev) {
1774                                deliver_skb(skb2, pt_prev, skb->dev);
1775                                pt_prev = ptype;
1776                                continue;
1777                        }
1778
1779                        skb2 = skb_clone(skb, GFP_ATOMIC);
1780                        if (!skb2)
1781                                break;
1782
1783                        net_timestamp_set(skb2);
1784
1785                        /* The network header should already be set
1786                           correctly by the sender, so the check below is
1787                           just protection against buggy protocols.
1788                         */
1789                        skb_reset_mac_header(skb2);
1790
1791                        if (skb_network_header(skb2) < skb2->data ||
1792                            skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1793                                net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1794                                                     ntohs(skb2->protocol),
1795                                                     dev->name);
1796                                skb_reset_network_header(skb2);
1797                        }
1798
1799                        skb2->transport_header = skb2->network_header;
1800                        skb2->pkt_type = PACKET_OUTGOING;
1801                        pt_prev = ptype;
1802                }
1803        }
1804        if (pt_prev)
1805                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1806        rcu_read_unlock();
1807}
1808
1809/**
1810 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1811 * @dev: Network device
1812 * @txq: number of queues available
1813 *
1814 * If real_num_tx_queues is changed the tc mappings may no longer be
1815 * valid. To resolve this, verify that each tc mapping remains valid and,
1816 * if not, reset the mapping to zero. With no priorities mapping to an
1817 * offset/count pair, that pair will no longer be used. In the worst case,
1818 * if TC0 itself is invalid nothing can be done, so priority mappings are
1819 * disabled entirely. It is expected that drivers will fix this mapping if
1820 * they can before calling netif_set_real_num_tx_queues.
1821 */
1822static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1823{
1824        int i;
1825        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1826
1827        /* If TC0 is invalidated disable TC mapping */
1828        if (tc->offset + tc->count > txq) {
1829                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1830                dev->num_tc = 0;
1831                return;
1832        }
1833
1834        /* Invalidated prio to tc mappings set to TC0 */
1835        for (i = 1; i < TC_BITMASK + 1; i++) {
1836                int q = netdev_get_prio_tc_map(dev, i);
1837
1838                tc = &dev->tc_to_txq[q];
1839                if (tc->offset + tc->count > txq) {
1840                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1841                                i, q);
1842                        netdev_set_prio_tc_map(dev, i, 0);
1843                }
1844        }
1845}
1846
1847#ifdef CONFIG_XPS
1848static DEFINE_MUTEX(xps_map_mutex);
1849#define xmap_dereference(P)             \
1850        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1851
1852static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1853                                        int cpu, u16 index)
1854{
1855        struct xps_map *map = NULL;
1856        int pos;
1857
1858        if (dev_maps)
1859                map = xmap_dereference(dev_maps->cpu_map[cpu]);
1860
1861        for (pos = 0; map && pos < map->len; pos++) {
1862                if (map->queues[pos] == index) {
1863                        if (map->len > 1) {
1864                                map->queues[pos] = map->queues[--map->len];
1865                        } else {
1866                                RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1867                                kfree_rcu(map, rcu);
1868                                map = NULL;
1869                        }
1870                        break;
1871                }
1872        }
1873
1874        return map;
1875}
1876
1877static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1878{
1879        struct xps_dev_maps *dev_maps;
1880        int cpu, i;
1881        bool active = false;
1882
1883        mutex_lock(&xps_map_mutex);
1884        dev_maps = xmap_dereference(dev->xps_maps);
1885
1886        if (!dev_maps)
1887                goto out_no_maps;
1888
1889        for_each_possible_cpu(cpu) {
1890                for (i = index; i < dev->num_tx_queues; i++) {
1891                        if (!remove_xps_queue(dev_maps, cpu, i))
1892                                break;
1893                }
1894                if (i == dev->num_tx_queues)
1895                        active = true;
1896        }
1897
1898        if (!active) {
1899                RCU_INIT_POINTER(dev->xps_maps, NULL);
1900                kfree_rcu(dev_maps, rcu);
1901        }
1902
1903        for (i = index; i < dev->num_tx_queues; i++)
1904                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1905                                             NUMA_NO_NODE);
1906
1907out_no_maps:
1908        mutex_unlock(&xps_map_mutex);
1909}
1910
1911static struct xps_map *expand_xps_map(struct xps_map *map,
1912                                      int cpu, u16 index)
1913{
1914        struct xps_map *new_map;
1915        int alloc_len = XPS_MIN_MAP_ALLOC;
1916        int i, pos;
1917
1918        for (pos = 0; map && pos < map->len; pos++) {
1919                if (map->queues[pos] != index)
1920                        continue;
1921                return map;
1922        }
1923
1924        /* Need to add queue to this CPU's existing map */
1925        if (map) {
1926                if (pos < map->alloc_len)
1927                        return map;
1928
1929                alloc_len = map->alloc_len * 2;
1930        }
1931
1932        /* Need to allocate new map to store queue on this CPU's map */
1933        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1934                               cpu_to_node(cpu));
1935        if (!new_map)
1936                return NULL;
1937
1938        for (i = 0; i < pos; i++)
1939                new_map->queues[i] = map->queues[i];
1940        new_map->alloc_len = alloc_len;
1941        new_map->len = pos;
1942
1943        return new_map;
1944}
1945
1946int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1947                        u16 index)
1948{
1949        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1950        struct xps_map *map, *new_map;
1951        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1952        int cpu, numa_node_id = -2;
1953        bool active = false;
1954
1955        mutex_lock(&xps_map_mutex);
1956
1957        dev_maps = xmap_dereference(dev->xps_maps);
1958
1959        /* allocate memory for queue storage */
1960        for_each_online_cpu(cpu) {
1961                if (!cpumask_test_cpu(cpu, mask))
1962                        continue;
1963
1964                if (!new_dev_maps)
1965                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1966                if (!new_dev_maps) {
1967                        mutex_unlock(&xps_map_mutex);
1968                        return -ENOMEM;
1969                }
1970
1971                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1972                                 NULL;
1973
1974                map = expand_xps_map(map, cpu, index);
1975                if (!map)
1976                        goto error;
1977
1978                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1979        }
1980
1981        if (!new_dev_maps)
1982                goto out_no_new_maps;
1983
1984        for_each_possible_cpu(cpu) {
1985                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1986                        /* add queue to CPU maps */
1987                        int pos = 0;
1988
1989                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1990                        while ((pos < map->len) && (map->queues[pos] != index))
1991                                pos++;
1992
1993                        if (pos == map->len)
1994                                map->queues[map->len++] = index;
1995#ifdef CONFIG_NUMA
1996                        if (numa_node_id == -2)
1997                                numa_node_id = cpu_to_node(cpu);
1998                        else if (numa_node_id != cpu_to_node(cpu))
1999                                numa_node_id = -1;
2000#endif
2001                } else if (dev_maps) {
2002                        /* fill in the new device map from the old device map */
2003                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2004                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2005                }
2006
2007        }
2008
2009        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2010
2011        /* Cleanup old maps */
2012        if (dev_maps) {
2013                for_each_possible_cpu(cpu) {
2014                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2015                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2016                        if (map && map != new_map)
2017                                kfree_rcu(map, rcu);
2018                }
2019
2020                kfree_rcu(dev_maps, rcu);
2021        }
2022
2023        dev_maps = new_dev_maps;
2024        active = true;
2025
2026out_no_new_maps:
2027        /* update Tx queue numa node */
2028        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2029                                     (numa_node_id >= 0) ? numa_node_id :
2030                                     NUMA_NO_NODE);
2031
2032        if (!dev_maps)
2033                goto out_no_maps;
2034
2035        /* removes queue from unused CPUs */
2036        for_each_possible_cpu(cpu) {
2037                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2038                        continue;
2039
2040                if (remove_xps_queue(dev_maps, cpu, index))
2041                        active = true;
2042        }
2043
2044        /* free map if not active */
2045        if (!active) {
2046                RCU_INIT_POINTER(dev->xps_maps, NULL);
2047                kfree_rcu(dev_maps, rcu);
2048        }
2049
2050out_no_maps:
2051        mutex_unlock(&xps_map_mutex);
2052
2053        return 0;
2054error:
2055        /* remove any maps that we added */
2056        for_each_possible_cpu(cpu) {
2057                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2058                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2059                                 NULL;
2060                if (new_map && new_map != map)
2061                        kfree(new_map);
2062        }
2063
2064        mutex_unlock(&xps_map_mutex);
2065
2066        kfree(new_dev_maps);
2067        return -ENOMEM;
2068}
2069EXPORT_SYMBOL(netif_set_xps_queue);
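
/*
 * Example (illustrative sketch; foo_setup_xps() is hypothetical): a
 * multiqueue driver can suggest a simple round-robin queue-to-CPU
 * transmit affinity once its TX queues exist.  Userspace may later
 * override this through /sys/class/net/<dev>/queues/tx-<n>/xps_cpus.
 */
#if 0
static void foo_setup_xps(struct net_device *dev)
{
        int i;

        for (i = 0; i < dev->real_num_tx_queues; i++)
                netif_set_xps_queue(dev,
                                    cpumask_of(i % num_online_cpus()), i);
}
#endif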
2070
2071#endif
2072/*
2073 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2074 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2075 */
2076int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2077{
2078        int rc;
2079
2080        if (txq < 1 || txq > dev->num_tx_queues)
2081                return -EINVAL;
2082
2083        if (dev->reg_state == NETREG_REGISTERED ||
2084            dev->reg_state == NETREG_UNREGISTERING) {
2085                ASSERT_RTNL();
2086
2087                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2088                                                  txq);
2089                if (rc)
2090                        return rc;
2091
2092                if (dev->num_tc)
2093                        netif_setup_tc(dev, txq);
2094
2095                if (txq < dev->real_num_tx_queues) {
2096                        qdisc_reset_all_tx_gt(dev, txq);
2097#ifdef CONFIG_XPS
2098                        netif_reset_xps_queues_gt(dev, txq);
2099#endif
2100                }
2101        }
2102
2103        dev->real_num_tx_queues = txq;
2104        return 0;
2105}
2106EXPORT_SYMBOL(netif_set_real_num_tx_queues);
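
/*
 * Example (illustrative sketch; foo_set_channels() is hypothetical): a
 * driver registered with room for many queues trims the number it
 * actually uses once the hardware or user configuration is known.
 * After registration this must run under RTNL, which is how e.g. the
 * ethtool set_channels path already calls into drivers.
 */
#if 0
static int foo_set_channels(struct net_device *dev, unsigned int count)
{
        int err;

        err = netif_set_real_num_tx_queues(dev, count);
        if (err)
                return err;

        return netif_set_real_num_rx_queues(dev, count);
}
#endif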
2107
2108#ifdef CONFIG_SYSFS
2109/**
2110 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2111 *      @dev: Network device
2112 *      @rxq: Actual number of RX queues
2113 *
2114 *      This must be called either with the rtnl_lock held or before
2115 *      registration of the net device.  Returns 0 on success, or a
2116 *      negative error code.  If called before registration, it always
2117 *      succeeds.
2118 */
2119int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2120{
2121        int rc;
2122
2123        if (rxq < 1 || rxq > dev->num_rx_queues)
2124                return -EINVAL;
2125
2126        if (dev->reg_state == NETREG_REGISTERED) {
2127                ASSERT_RTNL();
2128
2129                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2130                                                  rxq);
2131                if (rc)
2132                        return rc;
2133        }
2134
2135        dev->real_num_rx_queues = rxq;
2136        return 0;
2137}
2138EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2139#endif
2140
2141/**
2142 * netif_get_num_default_rss_queues - default number of RSS queues
2143 *
2144 * This routine should set an upper limit on the number of RSS queues
2145 * used by default by multiqueue devices.
2146 */
2147int netif_get_num_default_rss_queues(void)
2148{
2149        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2150}
2151EXPORT_SYMBOL(netif_get_num_default_rss_queues);
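
/*
 * Example (illustrative sketch; FOO_MAX_HW_QUEUES and
 * foo_pick_queue_count() are hypothetical): drivers usually cap their
 * queue count by this default as well as by what the hardware offers.
 */
#if 0
#define FOO_MAX_HW_QUEUES       16

static unsigned int foo_pick_queue_count(void)
{
        return min_t(unsigned int, FOO_MAX_HW_QUEUES,
                     netif_get_num_default_rss_queues());
}
#endif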
2152
2153static inline void __netif_reschedule(struct Qdisc *q)
2154{
2155        struct softnet_data *sd;
2156        unsigned long flags;
2157
2158        local_irq_save(flags);
2159        sd = this_cpu_ptr(&softnet_data);
2160        q->next_sched = NULL;
2161        *sd->output_queue_tailp = q;
2162        sd->output_queue_tailp = &q->next_sched;
2163        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2164        local_irq_restore(flags);
2165}
2166
2167void __netif_schedule(struct Qdisc *q)
2168{
2169        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2170                __netif_reschedule(q);
2171}
2172EXPORT_SYMBOL(__netif_schedule);
2173
2174struct dev_kfree_skb_cb {
2175        enum skb_free_reason reason;
2176};
2177
2178static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2179{
2180        return (struct dev_kfree_skb_cb *)skb->cb;
2181}
2182
2183void netif_schedule_queue(struct netdev_queue *txq)
2184{
2185        rcu_read_lock();
2186        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2187                struct Qdisc *q = rcu_dereference(txq->qdisc);
2188
2189                __netif_schedule(q);
2190        }
2191        rcu_read_unlock();
2192}
2193EXPORT_SYMBOL(netif_schedule_queue);
2194
2195/**
2196 *      netif_wake_subqueue - allow sending packets on subqueue
2197 *      @dev: network device
2198 *      @queue_index: sub queue index
2199 *
2200 * Resume individual transmit queue of a device with multiple transmit queues.
2201 */
2202void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2203{
2204        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2205
2206        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2207                struct Qdisc *q;
2208
2209                rcu_read_lock();
2210                q = rcu_dereference(txq->qdisc);
2211                __netif_schedule(q);
2212                rcu_read_unlock();
2213        }
2214}
2215EXPORT_SYMBOL(netif_wake_subqueue);
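
/*
 * Example (illustrative sketch; foo_tx_clean() and FOO_WAKE_THRESHOLD
 * are hypothetical): the usual pairing with netif_stop_subqueue().  A
 * driver stops a subqueue when its TX ring fills up and wakes it from
 * the TX completion path once enough descriptors have been reclaimed.
 */
#if 0
#define FOO_WAKE_THRESHOLD      16

static void foo_tx_clean(struct net_device *dev, u16 queue,
                         unsigned int descs_freed)
{
        if (__netif_subqueue_stopped(dev, queue) &&
            descs_freed >= FOO_WAKE_THRESHOLD)
                netif_wake_subqueue(dev, queue);
}
#endif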
2216
2217void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2218{
2219        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2220                struct Qdisc *q;
2221
2222                rcu_read_lock();
2223                q = rcu_dereference(dev_queue->qdisc);
2224                __netif_schedule(q);
2225                rcu_read_unlock();
2226        }
2227}
2228EXPORT_SYMBOL(netif_tx_wake_queue);
2229
2230void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2231{
2232        unsigned long flags;
2233
2234        if (likely(atomic_read(&skb->users) == 1)) {
2235                smp_rmb();
2236                atomic_set(&skb->users, 0);
2237        } else if (likely(!atomic_dec_and_test(&skb->users))) {
2238                return;
2239        }
2240        get_kfree_skb_cb(skb)->reason = reason;
2241        local_irq_save(flags);
2242        skb->next = __this_cpu_read(softnet_data.completion_queue);
2243        __this_cpu_write(softnet_data.completion_queue, skb);
2244        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2245        local_irq_restore(flags);
2246}
2247EXPORT_SYMBOL(__dev_kfree_skb_irq);
2248
2249void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2250{
2251        if (in_irq() || irqs_disabled())
2252                __dev_kfree_skb_irq(skb, reason);
2253        else
2254                dev_kfree_skb(skb);
2255}
2256EXPORT_SYMBOL(__dev_kfree_skb_any);
2257
2258
2259/**
2260 * netif_device_detach - mark device as removed
2261 * @dev: network device
2262 *
2263 * Mark device as removed from the system and therefore no longer available.
2264 */
2265void netif_device_detach(struct net_device *dev)
2266{
2267        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2268            netif_running(dev)) {
2269                netif_tx_stop_all_queues(dev);
2270        }
2271}
2272EXPORT_SYMBOL(netif_device_detach);
2273
2274/**
2275 * netif_device_attach - mark device as attached
2276 * @dev: network device
2277 *
2278 * Mark device as attached to the system and restart it if needed.
2279 */
2280void netif_device_attach(struct net_device *dev)
2281{
2282        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2283            netif_running(dev)) {
2284                netif_tx_wake_all_queues(dev);
2285                __netdev_watchdog_up(dev);
2286        }
2287}
2288EXPORT_SYMBOL(netif_device_attach);
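
/*
 * Example (illustrative sketch; foo_suspend()/foo_resume() are
 * hypothetical PM callbacks): the typical suspend/resume pairing of
 * these two helpers in a network driver.
 */
#if 0
static int foo_suspend(struct device *d)
{
        struct net_device *dev = dev_get_drvdata(d);

        netif_device_detach(dev);
        /* ... quiesce DMA and power down the hardware ... */
        return 0;
}

static int foo_resume(struct device *d)
{
        struct net_device *dev = dev_get_drvdata(d);

        /* ... power up and reinitialize the hardware ... */
        netif_device_attach(dev);
        return 0;
}
#endif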
2289
2290static void skb_warn_bad_offload(const struct sk_buff *skb)
2291{
2292        static const netdev_features_t null_features = 0;
2293        struct net_device *dev = skb->dev;
2294        const char *driver = "";
2295
2296        if (!net_ratelimit())
2297                return;
2298
2299        if (dev && dev->dev.parent)
2300                driver = dev_driver_string(dev->dev.parent);
2301
2302        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2303             "gso_type=%d ip_summed=%d\n",
2304             driver, dev ? &dev->features : &null_features,
2305             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2306             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2307             skb_shinfo(skb)->gso_type, skb->ip_summed);
2308}
2309
2310/*
2311 * Invalidate hardware checksum when packet is to be mangled, and
2312 * complete checksum manually on outgoing path.
2313 */
2314int skb_checksum_help(struct sk_buff *skb)
2315{
2316        __wsum csum;
2317        int ret = 0, offset;
2318
2319        if (skb->ip_summed == CHECKSUM_COMPLETE)
2320                goto out_set_summed;
2321
2322        if (unlikely(skb_shinfo(skb)->gso_size)) {
2323                skb_warn_bad_offload(skb);
2324                return -EINVAL;
2325        }
2326
2327        /* Before computing a checksum, we should make sure no frag could
2328         * be modified by an external entity: the checksum could be wrong.
2329         */
2330        if (skb_has_shared_frag(skb)) {
2331                ret = __skb_linearize(skb);
2332                if (ret)
2333                        goto out;
2334        }
2335
2336        offset = skb_checksum_start_offset(skb);
2337        BUG_ON(offset >= skb_headlen(skb));
2338        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2339
2340        offset += skb->csum_offset;
2341        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2342
2343        if (skb_cloned(skb) &&
2344            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2345                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2346                if (ret)
2347                        goto out;
2348        }
2349
2350        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2351out_set_summed:
2352        skb->ip_summed = CHECKSUM_NONE;
2353out:
2354        return ret;
2355}
2356EXPORT_SYMBOL(skb_checksum_help);
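
/*
 * Example (illustrative sketch; foo_tx_csum() is hypothetical): a
 * driver whose hardware cannot checksum a given packet resolves
 * CHECKSUM_PARTIAL in software before handing the skb to the DMA
 * engine.
 */
#if 0
static int foo_tx_csum(struct sk_buff *skb, bool hw_can_csum)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
                return skb_checksum_help(skb);
        return 0;
}
#endif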
2357
2358__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2359{
2360        unsigned int vlan_depth = skb->mac_len;
2361        __be16 type = skb->protocol;
2362
2363        /* Tunnel gso handlers can set protocol to ethernet. */
2364        if (type == htons(ETH_P_TEB)) {
2365                struct ethhdr *eth;
2366
2367                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2368                        return 0;
2369
2370                eth = (struct ethhdr *)skb_mac_header(skb);
2371                type = eth->h_proto;
2372        }
2373
2374        /* if skb->protocol is 802.1Q/AD then the header should already be
2375         * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2376         * ETH_HLEN otherwise
2377         */
2378        if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2379                if (vlan_depth) {
2380                        if (WARN_ON(vlan_depth < VLAN_HLEN))
2381                                return 0;
2382                        vlan_depth -= VLAN_HLEN;
2383                } else {
2384                        vlan_depth = ETH_HLEN;
2385                }
2386                do {
2387                        struct vlan_hdr *vh;
2388
2389                        if (unlikely(!pskb_may_pull(skb,
2390                                                    vlan_depth + VLAN_HLEN)))
2391                                return 0;
2392
2393                        vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2394                        type = vh->h_vlan_encapsulated_proto;
2395                        vlan_depth += VLAN_HLEN;
2396                } while (type == htons(ETH_P_8021Q) ||
2397                         type == htons(ETH_P_8021AD));
2398        }
2399
2400        *depth = vlan_depth;
2401
2402        return type;
2403}
2404
2405/**
2406 *      skb_mac_gso_segment - mac layer segmentation handler.
2407 *      @skb: buffer to segment
2408 *      @features: features for the output path (see dev->features)
2409 */
2410struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2411                                    netdev_features_t features)
2412{
2413        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2414        struct packet_offload *ptype;
2415        int vlan_depth = skb->mac_len;
2416        __be16 type = skb_network_protocol(skb, &vlan_depth);
2417
2418        if (unlikely(!type))
2419                return ERR_PTR(-EINVAL);
2420
2421        __skb_pull(skb, vlan_depth);
2422
2423        rcu_read_lock();
2424        list_for_each_entry_rcu(ptype, &offload_base, list) {
2425                if (ptype->type == type && ptype->callbacks.gso_segment) {
2426                        segs = ptype->callbacks.gso_segment(skb, features);
2427                        break;
2428                }
2429        }
2430        rcu_read_unlock();
2431
2432        __skb_push(skb, skb->data - skb_mac_header(skb));
2433
2434        return segs;
2435}
2436EXPORT_SYMBOL(skb_mac_gso_segment);
2437
2438
2439/* openvswitch calls this on rx path, so we need a different check.
2440 */
2441static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2442{
2443        if (tx_path)
2444                return skb->ip_summed != CHECKSUM_PARTIAL;
2445        else
2446                return skb->ip_summed == CHECKSUM_NONE;
2447}
2448
2449/**
2450 *      __skb_gso_segment - Perform segmentation on skb.
2451 *      @skb: buffer to segment
2452 *      @features: features for the output path (see dev->features)
2453 *      @tx_path: whether it is called in TX path
2454 *
2455 *      This function segments the given skb and returns a list of segments.
2456 *
2457 *      It may return NULL if the skb requires no segmentation.  This is
2458 *      only possible when GSO is used for verifying header integrity.
2459 */
2460struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2461                                  netdev_features_t features, bool tx_path)
2462{
2463        if (unlikely(skb_needs_check(skb, tx_path))) {
2464                int err;
2465
2466                skb_warn_bad_offload(skb);
2467
2468                err = skb_cow_head(skb, 0);
2469                if (err < 0)
2470                        return ERR_PTR(err);
2471        }
2472
2473        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2474        SKB_GSO_CB(skb)->encap_level = 0;
2475
2476        skb_reset_mac_header(skb);
2477        skb_reset_mac_len(skb);
2478
2479        return skb_mac_gso_segment(skb, features);
2480}
2481EXPORT_SYMBOL(__skb_gso_segment);
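
/*
 * Example (illustrative sketch; foo_sw_gso() is hypothetical): the
 * common software fallback built on the TX-path wrapper
 * skb_gso_segment() - segment the skb, free the original, then hand
 * each resulting segment to the real transmit routine.
 */
#if 0
static int foo_sw_gso(struct sk_buff *skb, netdev_features_t features)
{
        struct sk_buff *segs;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)
                return 0;       /* no segmentation needed, keep using skb */

        consume_skb(skb);
        while (segs) {
                struct sk_buff *next = segs->next;

                segs->next = NULL;
                /* ... transmit "segs" here ... */
                segs = next;
        }
        return 0;
}
#endif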
2482
2483/* Take action when hardware reception checksum errors are detected. */
2484#ifdef CONFIG_BUG
2485void netdev_rx_csum_fault(struct net_device *dev)
2486{
2487        if (net_ratelimit()) {
2488                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2489                dump_stack();
2490        }
2491}
2492EXPORT_SYMBOL(netdev_rx_csum_fault);
2493#endif
2494
2495/* Actually, we should eliminate this check as soon as we know that:
2496 * 1. An IOMMU is present and can map all of the memory.
2497 * 2. No high memory really exists on this machine.
2498 */
2499
2500static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2501{
2502#ifdef CONFIG_HIGHMEM
2503        int i;
2504        if (!(dev->features & NETIF_F_HIGHDMA)) {
2505                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2506                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2507                        if (PageHighMem(skb_frag_page(frag)))
2508                                return 1;
2509                }
2510        }
2511
2512        if (PCI_DMA_BUS_IS_PHYS) {
2513                struct device *pdev = dev->dev.parent;
2514
2515                if (!pdev)
2516                        return 0;
2517                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2518                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2519                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2520                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2521                                return 1;
2522                }
2523        }
2524#endif
2525        return 0;
2526}
2527
2528/* If this is an MPLS offload request, verify we are testing hardware MPLS
2529 * features instead of the standard features for the netdev.
2530 */
2531#ifdef CONFIG_NET_MPLS_GSO
2532static netdev_features_t net_mpls_features(struct sk_buff *skb,
2533                                           netdev_features_t features,
2534                                           __be16 type)
2535{
2536        if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2537                features &= skb->dev->mpls_features;
2538
2539        return features;
2540}
2541#else
2542static netdev_features_t net_mpls_features(struct sk_buff *skb,
2543                                           netdev_features_t features,
2544                                           __be16 type)
2545{
2546        return features;
2547}
2548#endif
2549
2550static netdev_features_t harmonize_features(struct sk_buff *skb,
2551        netdev_features_t features)
2552{
2553        int tmp;
2554        __be16 type;
2555
2556        type = skb_network_protocol(skb, &tmp);
2557        features = net_mpls_features(skb, features, type);
2558
2559        if (skb->ip_summed != CHECKSUM_NONE &&
2560            !can_checksum_protocol(features, type)) {
2561                features &= ~NETIF_F_ALL_CSUM;
2562        } else if (illegal_highdma(skb->dev, skb)) {
2563                features &= ~NETIF_F_SG;
2564        }
2565
2566        return features;
2567}
2568
2569netdev_features_t netif_skb_features(struct sk_buff *skb)
2570{
2571        struct net_device *dev = skb->dev;
2572        netdev_features_t features = dev->features;
2573        u16 gso_segs = skb_shinfo(skb)->gso_segs;
2574        __be16 protocol = skb->protocol;
2575
2576        if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2577                features &= ~NETIF_F_GSO_MASK;
2578
2579        /* If this is an encapsulation offload request, verify we are
2580         * testing hardware encapsulation features instead of the standard
2581         * features for the netdev
2582         */
2583        if (skb->encapsulation)
2584                features &= dev->hw_enc_features;
2585
2586        if (!vlan_tx_tag_present(skb)) {
2587                if (unlikely(protocol == htons(ETH_P_8021Q) ||
2588                             protocol == htons(ETH_P_8021AD))) {
2589                        struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2590                        protocol = veh->h_vlan_encapsulated_proto;
2591                } else {
2592                        goto finalize;
2593                }
2594        }
2595
2596        features = netdev_intersect_features(features,
2597                                             dev->vlan_features |
2598                                             NETIF_F_HW_VLAN_CTAG_TX |
2599                                             NETIF_F_HW_VLAN_STAG_TX);
2600
2601        if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2602                features = netdev_intersect_features(features,
2603                                                     NETIF_F_SG |
2604                                                     NETIF_F_HIGHDMA |
2605                                                     NETIF_F_FRAGLIST |
2606                                                     NETIF_F_GEN_CSUM |
2607                                                     NETIF_F_HW_VLAN_CTAG_TX |
2608                                                     NETIF_F_HW_VLAN_STAG_TX);
2609
2610finalize:
2611        if (dev->netdev_ops->ndo_features_check)
2612                features &= dev->netdev_ops->ndo_features_check(skb, dev,
2613                                                                features);
2614
2615        return harmonize_features(skb, features);
2616}
2617EXPORT_SYMBOL(netif_skb_features);
2618
2619static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2620                    struct netdev_queue *txq, bool more)
2621{
2622        unsigned int len;
2623        int rc = NETDEV_TX_OK;
2624
2625#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
2626        if (!list_empty(&ptype_all) &&
2627            !(skb->imq_flags & IMQ_F_ENQUEUE))
2628#else
2629        if (!list_empty(&ptype_all))
2630#endif
2631                dev_queue_xmit_nit(skb, dev);
2632
2633#ifdef CONFIG_ETHERNET_PACKET_MANGLE
2634        /* dev->eth_mangle_tx() returns NULL if it consumed the skb */
2635        if (!dev->eth_mangle_tx ||
2636            (skb = dev->eth_mangle_tx(dev, skb)) != NULL)
2637#else
2638        if (1)
2639#endif
2640        {
2641                len = skb->len;
2642                trace_net_dev_start_xmit(skb, dev);
2643                rc = netdev_start_xmit(skb, dev, txq, more);
2644                trace_net_dev_xmit(skb, rc, dev, len);
2645        }
2646        return rc;
2647}
2648
2649struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2650                                    struct netdev_queue *txq, int *ret)
2651{
2652        struct sk_buff *skb = first;
2653        int rc = NETDEV_TX_OK;
2654
2655        while (skb) {
2656                struct sk_buff *next = skb->next;
2657
2658                skb->next = NULL;
2659                rc = xmit_one(skb, dev, txq, next != NULL);
2660                if (unlikely(!dev_xmit_complete(rc))) {
2661                        skb->next = next;
2662                        goto out;
2663                }
2664
2665                skb = next;
2666                if (netif_xmit_stopped(txq) && skb) {
2667                        rc = NETDEV_TX_BUSY;
2668                        break;
2669                }
2670        }
2671
2672out:
2673        *ret = rc;
2674        return skb;
2675}
2676
2677static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2678                                          netdev_features_t features)
2679{
2680        if (vlan_tx_tag_present(skb) &&
2681            !vlan_hw_offload_capable(features, skb->vlan_proto))
2682                skb = __vlan_hwaccel_push_inside(skb);
2683        return skb;
2684}
2685
2686static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2687{
2688        netdev_features_t features;
2689
2690        if (skb->next)
2691                return skb;
2692
2693        features = netif_skb_features(skb);
2694        skb = validate_xmit_vlan(skb, features);
2695        if (unlikely(!skb))
2696                goto out_null;
2697
2698        if (netif_needs_gso(dev, skb, features)) {
2699                struct sk_buff *segs;
2700
2701                segs = skb_gso_segment(skb, features);
2702                if (IS_ERR(segs)) {
2703                        goto out_kfree_skb;
2704                } else if (segs) {
2705                        consume_skb(skb);
2706                        skb = segs;
2707                }
2708        } else {
2709                if (skb_needs_linearize(skb, features) &&
2710                    __skb_linearize(skb))
2711                        goto out_kfree_skb;
2712
2713                /* If packet is not checksummed and device does not
2714                 * support checksumming for this protocol, complete
2715                 * checksumming here.
2716                 */
2717                if (skb->ip_summed == CHECKSUM_PARTIAL) {
2718                        if (skb->encapsulation)
2719                                skb_set_inner_transport_header(skb,
2720                                                               skb_checksum_start_offset(skb));
2721                        else
2722                                skb_set_transport_header(skb,
2723                                                         skb_checksum_start_offset(skb));
2724                        if (!(features & NETIF_F_ALL_CSUM) &&
2725                            skb_checksum_help(skb))
2726                                goto out_kfree_skb;
2727                }
2728        }
2729
2730        return skb;
2731
2732out_kfree_skb:
2733        kfree_skb(skb);
2734out_null:
2735        return NULL;
2736}
2737
2738struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2739{
2740        struct sk_buff *next, *head = NULL, *tail;
2741
2742        for (; skb != NULL; skb = next) {
2743                next = skb->next;
2744                skb->next = NULL;
2745
2746                /* in case skb won't be segmented, point to itself */
2747                skb->prev = skb;
2748
2749                skb = validate_xmit_skb(skb, dev);
2750                if (!skb)
2751                        continue;
2752
2753                if (!head)
2754                        head = skb;
2755                else
2756                        tail->next = skb;
2757                /* If skb was segmented, skb->prev points to
2758                 * the last segment. If not, it still contains skb.
2759                 */
2760                tail = skb->prev;
2761        }
2762        return head;
2763}
2764EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2765
2766static void qdisc_pkt_len_init(struct sk_buff *skb)
2767{
2768        const struct skb_shared_info *shinfo = skb_shinfo(skb);
2769
2770        qdisc_skb_cb(skb)->pkt_len = skb->len;
2771
2772        /* To get a more precise estimate of the bytes sent on the wire,
2773         * we add the header size of every segment to pkt_len
2774         */
2775        if (shinfo->gso_size)  {
2776                unsigned int hdr_len;
2777                u16 gso_segs = shinfo->gso_segs;
2778
2779                /* mac layer + network layer */
2780                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2781
2782                /* + transport layer */
2783                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2784                        hdr_len += tcp_hdrlen(skb);
2785                else
2786                        hdr_len += sizeof(struct udphdr);
2787
2788                if (shinfo->gso_type & SKB_GSO_DODGY)
2789                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2790                                                shinfo->gso_size);
2791
2792                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2793        }
2794}
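
/*
 * Worked example for the estimate above (illustrative numbers only): a
 * TCP GSO skb carrying 45 segments of gso_size 1448 behind 54 bytes of
 * Ethernet + IPv4 + TCP headers has skb->len = 45 * 1448 + 54 = 65214.
 * pkt_len then becomes 65214 + (45 - 1) * 54 = 67590, which matches the
 * 45 * (1448 + 54) bytes that will actually go out on the wire once the
 * skb is segmented.
 */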
2795
2796static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2797                                 struct net_device *dev,
2798                                 struct netdev_queue *txq)
2799{
2800        spinlock_t *root_lock = qdisc_lock(q);
2801        bool contended;
2802        int rc;
2803
2804        qdisc_pkt_len_init(skb);
2805        qdisc_calculate_pkt_len(skb, q);
2806        /*
2807         * Heuristic to force contended enqueues to serialize on a
2808         * separate lock before trying to get qdisc main lock.
2809         * This permits __QDISC___STATE_RUNNING owner to get the lock more
2810         * often and dequeue packets faster.
2811         */
2812        contended = qdisc_is_running(q);
2813        if (unlikely(contended))
2814                spin_lock(&q->busylock);
2815
2816        spin_lock(root_lock);
2817        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2818                kfree_skb(skb);
2819                rc = NET_XMIT_DROP;
2820        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2821                   qdisc_run_begin(q)) {
2822                /*
2823                 * This is a work-conserving queue; there are no old skbs
2824                 * waiting to be sent out; and the qdisc is not running -
2825                 * xmit the skb directly.
2826                 */
2827
2828                qdisc_bstats_update(q, skb);
2829
2830                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2831                        if (unlikely(contended)) {
2832                                spin_unlock(&q->busylock);
2833                                contended = false;
2834                        }
2835                        __qdisc_run(q);
2836                } else
2837                        qdisc_run_end(q);
2838
2839                rc = NET_XMIT_SUCCESS;
2840        } else {
2841                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2842                if (qdisc_run_begin(q)) {
2843                        if (unlikely(contended)) {
2844                                spin_unlock(&q->busylock);
2845                                contended = false;
2846                        }
2847                        __qdisc_run(q);
2848                }
2849        }
2850        spin_unlock(root_lock);
2851        if (unlikely(contended))
2852                spin_unlock(&q->busylock);
2853        return rc;
2854}
2855
2856#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2857static void skb_update_prio(struct sk_buff *skb)
2858{
2859        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2860
2861        if (!skb->priority && skb->sk && map) {
2862                unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2863
2864                if (prioidx < map->priomap_len)
2865                        skb->priority = map->priomap[prioidx];
2866        }
2867}
2868#else
2869#define skb_update_prio(skb)
2870#endif
2871
2872DEFINE_PER_CPU(int, xmit_recursion);
2873EXPORT_SYMBOL(xmit_recursion);
2874
2875#define RECURSION_LIMIT 10
2876
2877/**
2878 *      dev_loopback_xmit - loop back @skb
2879 *      @skb: buffer to transmit
2880 */
2881int dev_loopback_xmit(struct sk_buff *skb)
2882{
2883        skb_reset_mac_header(skb);
2884        __skb_pull(skb, skb_network_offset(skb));
2885        skb->pkt_type = PACKET_LOOPBACK;
2886        skb->ip_summed = CHECKSUM_UNNECESSARY;
2887        WARN_ON(!skb_dst(skb));
2888        skb_dst_force(skb);
2889        netif_rx_ni(skb);
2890        return 0;
2891}
2892EXPORT_SYMBOL(dev_loopback_xmit);
2893
2894/**
2895 *      __dev_queue_xmit - transmit a buffer
2896 *      @skb: buffer to transmit
2897 *      @accel_priv: private data used for L2 forwarding offload
2898 *
2899 *      Queue a buffer for transmission to a network device. The caller must
2900 *      have set the device and priority and built the buffer before calling
2901 *      this function. The function can be called from an interrupt.
2902 *
2903 *      A negative errno code is returned on a failure. A success does not
2904 *      guarantee the frame will be transmitted as it may be dropped due
2905 *      to congestion or traffic shaping.
2906 *
2907 * -----------------------------------------------------------------------------------
2908 *      I notice this method can also return errors from the queue disciplines,
2909 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2910 *      be positive.
2911 *
2912 *      Regardless of the return value, the skb is consumed, so it is currently
2913 *      difficult to retry a send to this method.  (You can bump the ref count
2914 *      before sending to hold a reference for retry if you are careful.)
2915 *
2916 *      When calling this method, interrupts MUST be enabled.  This is because
2917 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2918 *          --BLG
2919 */
2920static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2921{
2922        struct net_device *dev = skb->dev;
2923        struct netdev_queue *txq;
2924        struct Qdisc *q;
2925        int rc = -ENOMEM;
2926
2927        skb_reset_mac_header(skb);
2928
2929        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2930                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2931
2932        /* Disable soft irqs for various locks below. Also
2933         * stops preemption for RCU.
2934         */
2935        rcu_read_lock_bh();
2936
2937        skb_update_prio(skb);
2938
2939        /* If device/qdisc don't need skb->dst, release it right now while
2940         * it's hot in this cpu cache.
2941         */
2942        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2943                skb_dst_drop(skb);
2944        else
2945                skb_dst_force(skb);
2946
2947        txq = netdev_pick_tx(dev, skb, accel_priv);
2948        q = rcu_dereference_bh(txq->qdisc);
2949
2950#ifdef CONFIG_NET_CLS_ACT
2951        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2952#endif
2953        trace_net_dev_queue(skb);
2954        if (q->enqueue) {
2955                rc = __dev_xmit_skb(skb, q, dev, txq);
2956                goto out;
2957        }
2958
2959        /* The device has no queue. Common case for software devices:
2960           loopback, all the sorts of tunnels...
2961
2962           Really, it is unlikely that netif_tx_lock protection is necessary
2963           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2964           counters.)
2965           However, it is possible that they rely on the protection
2966           we provide here.
2967
2968           Check this and take the lock. It is not prone to deadlocks.
2969           Either way, the noqueue qdisc case is even simpler 8)
2970         */
2971        if (dev->flags & IFF_UP) {
2972                int cpu = smp_processor_id(); /* ok because BHs are off */
2973
2974                if (txq->xmit_lock_owner != cpu) {
2975
2976                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2977                                goto recursion_alert;
2978
2979                        skb = validate_xmit_skb(skb, dev);
2980                        if (!skb)
2981                                goto drop;
2982
2983                        HARD_TX_LOCK(dev, txq, cpu);
2984
2985                        if (!netif_xmit_stopped(txq)) {
2986                                __this_cpu_inc(xmit_recursion);
2987                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2988                                __this_cpu_dec(xmit_recursion);
2989                                if (dev_xmit_complete(rc)) {
2990                                        HARD_TX_UNLOCK(dev, txq);
2991                                        goto out;
2992                                }
2993                        }
2994                        HARD_TX_UNLOCK(dev, txq);
2995                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2996                                             dev->name);
2997                } else {
2998                        /* Recursion is detected! It is possible,
2999                         * unfortunately
3000                         */
3001recursion_alert:
3002                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3003                                             dev->name);
3004                }
3005        }
3006
3007        rc = -ENETDOWN;
3008drop:
3009        rcu_read_unlock_bh();
3010
3011        atomic_long_inc(&dev->tx_dropped);
3012        kfree_skb_list(skb);
3013        return rc;
3014out:
3015        rcu_read_unlock_bh();
3016        return rc;
3017}
3018
3019int dev_queue_xmit(struct sk_buff *skb)
3020{
3021        return __dev_queue_xmit(skb, NULL);
3022}
3023EXPORT_SYMBOL(dev_queue_xmit);
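
/*
 * Example (illustrative sketch; foo_send_frame() is hypothetical): a
 * kernel user that has built a complete frame hands it to the stack.
 * Whatever the return value, the skb has been consumed and must not be
 * touched afterwards.
 */
#if 0
static int foo_send_frame(struct net_device *dev, struct sk_buff *skb)
{
        skb->dev = dev;
        return dev_queue_xmit(skb);
}
#endif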
3024
3025int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3026{
3027        return __dev_queue_xmit(skb, accel_priv);
3028}
3029EXPORT_SYMBOL(dev_queue_xmit_accel);
3030
3031
3032/*=======================================================================
3033                        Receiver routines
3034  =======================================================================*/
3035
3036int netdev_max_backlog __read_mostly = 1000;
3037EXPORT_SYMBOL(netdev_max_backlog);
3038
3039int netdev_tstamp_prequeue __read_mostly = 1;
3040int netdev_budget __read_mostly = 300;
3041int weight_p __read_mostly = 64;            /* old backlog weight */
3042
3043/* Called with irq disabled */
3044static inline void ____napi_schedule(struct softnet_data *sd,
3045                                     struct napi_struct *napi)
3046{
3047        list_add_tail(&napi->poll_list, &sd->poll_list);
3048        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3049}
3050
3051#ifdef CONFIG_RPS
3052
3053/* One global table that all flow-based protocols share. */
3054struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3055EXPORT_SYMBOL(rps_sock_flow_table);
3056
3057struct static_key rps_needed __read_mostly;
3058
3059static struct rps_dev_flow *
3060set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3061            struct rps_dev_flow *rflow, u16 next_cpu)
3062{
3063        if (next_cpu != RPS_NO_CPU) {
3064#ifdef CONFIG_RFS_ACCEL
3065                struct netdev_rx_queue *rxqueue;
3066                struct rps_dev_flow_table *flow_table;
3067                struct rps_dev_flow *old_rflow;
3068                u32 flow_id;
3069                u16 rxq_index;
3070                int rc;
3071
3072                /* Should we steer this flow to a different hardware queue? */
3073                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3074                    !(dev->features & NETIF_F_NTUPLE))
3075                        goto out;
3076                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3077                if (rxq_index == skb_get_rx_queue(skb))
3078                        goto out;
3079
3080                rxqueue = dev->_rx + rxq_index;
3081                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3082                if (!flow_table)
3083                        goto out;
3084                flow_id = skb_get_hash(skb) & flow_table->mask;
3085                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3086                                                        rxq_index, flow_id);
3087                if (rc < 0)
3088                        goto out;
3089                old_rflow = rflow;
3090                rflow = &flow_table->flows[flow_id];
3091                rflow->filter = rc;
3092                if (old_rflow->filter == rflow->filter)
3093                        old_rflow->filter = RPS_NO_FILTER;
3094        out:
3095#endif
3096                rflow->last_qtail =
3097                        per_cpu(softnet_data, next_cpu).input_queue_head;
3098        }
3099
3100        rflow->cpu = next_cpu;
3101        return rflow;
3102}
3103
3104/*
3105 * get_rps_cpu is called from netif_receive_skb and returns the target
3106 * CPU from the RPS map of the receiving queue for a given skb.
3107 * rcu_read_lock must be held on entry.
3108 */
3109static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3110                       struct rps_dev_flow **rflowp)
3111{
3112        struct netdev_rx_queue *rxqueue;
3113        struct rps_map *map;
3114        struct rps_dev_flow_table *flow_table;
3115        struct rps_sock_flow_table *sock_flow_table;
3116        int cpu = -1;
3117        u16 tcpu;
3118        u32 hash;
3119
3120        if (skb_rx_queue_recorded(skb)) {
3121                u16 index = skb_get_rx_queue(skb);
3122                if (unlikely(index >= dev->real_num_rx_queues)) {
3123                        WARN_ONCE(dev->real_num_rx_queues > 1,
3124                                  "%s received packet on queue %u, but number "
3125                                  "of RX queues is %u\n",
3126                                  dev->name, index, dev->real_num_rx_queues);
3127                        goto done;
3128                }
3129                rxqueue = dev->_rx + index;
3130        } else
3131                rxqueue = dev->_rx;
3132
3133        map = rcu_dereference(rxqueue->rps_map);
3134        if (map) {
3135                if (map->len == 1 &&
3136                    !rcu_access_pointer(rxqueue->rps_flow_table)) {
3137                        tcpu = map->cpus[0];
3138                        if (cpu_online(tcpu))
3139                                cpu = tcpu;
3140                        goto done;
3141                }
3142        } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3143                goto done;
3144        }
3145
3146        skb_reset_network_header(skb);
3147        hash = skb_get_hash(skb);
3148        if (!hash)
3149                goto done;
3150
3151        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3152        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3153        if (flow_table && sock_flow_table) {
3154                u16 next_cpu;
3155                struct rps_dev_flow *rflow;
3156
3157                rflow = &flow_table->flows[hash & flow_table->mask];
3158                tcpu = rflow->cpu;
3159
3160                next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3161
3162                /*
3163                 * If the desired CPU (where last recvmsg was done) is
3164                 * different from current CPU (one in the rx-queue flow
3165                 * table entry), switch if one of the following holds:
3166                 *   - Current CPU is unset (equal to RPS_NO_CPU).
3167                 *   - Current CPU is offline.
3168                 *   - The current CPU's queue tail has advanced beyond the
3169                 *     last packet that was enqueued using this table entry.
3170                 *     This guarantees that all previous packets for the flow
3171                 *     have been dequeued, thus preserving in-order delivery.
3172                 */
3173                if (unlikely(tcpu != next_cpu) &&
3174                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3175                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3176                      rflow->last_qtail)) >= 0)) {
3177                        tcpu = next_cpu;
3178                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3179                }
3180
3181                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3182                        *rflowp = rflow;
3183                        cpu = tcpu;
3184                        goto done;
3185                }
3186        }
3187
3188        if (map) {
3189                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3190                if (cpu_online(tcpu)) {
3191                        cpu = tcpu;
3192                        goto done;
3193                }
3194        }
3195
3196done:
3197        return cpu;
3198}
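
/*
 * A minimal sketch (not part of this file, kept compiled out) of the
 * wrap-safe comparison used above: input_queue_head and last_qtail are
 * free-running unsigned counters, so "head has passed tail" is tested by
 * casting their difference to int, which stays correct across wrap-around.
 * The helper name is invented for illustration.
 */
#if 0
static bool queue_head_passed(unsigned int head, unsigned int tail)
{
        return (int)(head - tail) >= 0; /* true even after head wraps past UINT_MAX */
}
#endif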
3199
3200#ifdef CONFIG_RFS_ACCEL
3201
3202/**
3203 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3204 * @dev: Device on which the filter was set
3205 * @rxq_index: RX queue index
3206 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3207 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3208 *
3209 * Drivers that implement ndo_rx_flow_steer() should periodically call
3210 * this function for each installed filter and remove the filters for
3211 * which it returns %true.
3212 */
3213bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3214                         u32 flow_id, u16 filter_id)
3215{
3216        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3217        struct rps_dev_flow_table *flow_table;
3218        struct rps_dev_flow *rflow;
3219        bool expire = true;
3220        int cpu;
3221
3222        rcu_read_lock();
3223        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3224        if (flow_table && flow_id <= flow_table->mask) {
3225                rflow = &flow_table->flows[flow_id];
3226                cpu = ACCESS_ONCE(rflow->cpu);
3227                if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3228                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3229                           rflow->last_qtail) <
3230                     (int)(10 * flow_table->mask)))
3231                        expire = false;
3232        }
3233        rcu_read_unlock();
3234        return expire;
3235}
3236EXPORT_SYMBOL(rps_may_expire_flow);
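
/*
 * A minimal sketch, assuming a hypothetical driver (not part of this file,
 * kept compiled out): how an ndo_rx_flow_steer() implementation might
 * periodically age out its hardware filters with rps_may_expire_flow(), as
 * the comment above suggests.  The foo_* types, fields and the per-queue
 * filter bookkeeping are invented.
 */
#if 0
static void foo_expire_rfs_filters(struct foo_rx_queue *rxq)
{
        unsigned int i;

        for (i = 0; i < rxq->nfilters; i++) {
                struct foo_rfs_filter *f = &rxq->filters[i];

                if (!f->installed)
                        continue;
                if (rps_may_expire_flow(rxq->netdev, rxq->index,
                                        f->flow_id, f->filter_id)) {
                        foo_hw_remove_filter(rxq, f);   /* hypothetical helper */
                        f->installed = false;
                }
        }
}
#endif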
3237
3238#endif /* CONFIG_RFS_ACCEL */
3239
3240/* Called from hardirq (IPI) context */
3241static void rps_trigger_softirq(void *data)
3242{
3243        struct softnet_data *sd = data;
3244
3245        ____napi_schedule(sd, &sd->backlog);
3246        sd->received_rps++;
3247}
3248
3249#endif /* CONFIG_RPS */
3250
3251/*
3252 * Check if this softnet_data structure belongs to another CPU.
3253 * If yes, queue it on our IPI list and return 1.
3254 * If no, return 0.
3255 */
3256static int rps_ipi_queued(struct softnet_data *sd)
3257{
3258#ifdef CONFIG_RPS
3259        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3260
3261        if (sd != mysd) {
3262                sd->rps_ipi_next = mysd->rps_ipi_list;
3263                mysd->rps_ipi_list = sd;
3264
3265                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3266                return 1;
3267        }
3268#endif /* CONFIG_RPS */
3269        return 0;
3270}
3271
3272#ifdef CONFIG_NET_FLOW_LIMIT
3273int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3274#endif
3275
3276static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3277{
3278#ifdef CONFIG_NET_FLOW_LIMIT
3279        struct sd_flow_limit *fl;
3280        struct softnet_data *sd;
3281        unsigned int old_flow, new_flow;
3282
3283        if (qlen < (netdev_max_backlog >> 1))
3284                return false;
3285
3286        sd = this_cpu_ptr(&softnet_data);
3287
3288        rcu_read_lock();
3289        fl = rcu_dereference(sd->flow_limit);
3290        if (fl) {
3291                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3292                old_flow = fl->history[fl->history_head];
3293                fl->history[fl->history_head] = new_flow;
3294
3295                fl->history_head++;
3296                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3297
3298                if (likely(fl->buckets[old_flow]))
3299                        fl->buckets[old_flow]--;
3300
3301                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3302                        fl->count++;
3303                        rcu_read_unlock();
3304                        return true;
3305                }
3306        }
3307        rcu_read_unlock();
3308#endif
3309        return false;
3310}
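
/*
 * A minimal userspace-style sketch (not part of this file, kept compiled
 * out) of the accounting above: a ring of the last HISTORY flow ids plus
 * per-bucket counters, where a packet is flagged once its flow occupies
 * more than half of the recent history.  All names and sizes are invented
 * for illustration.
 */
#if 0
#define HISTORY 128                     /* must be a power of two */
#define BUCKETS 4096                    /* must be a power of two */

struct flow_limit_demo {
        unsigned int head;
        unsigned int history[HISTORY];
        unsigned int buckets[BUCKETS];
};

static bool flow_limit_demo_hit(struct flow_limit_demo *fl, unsigned int hash)
{
        unsigned int new_flow = hash & (BUCKETS - 1);
        unsigned int old_flow = fl->history[fl->head];

        fl->history[fl->head] = new_flow;
        fl->head = (fl->head + 1) & (HISTORY - 1);

        if (fl->buckets[old_flow])
                fl->buckets[old_flow]--;

        return ++fl->buckets[new_flow] > (HISTORY >> 1);
}
#endif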
3311
3312/*
3313 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
3314 * queue (which may be a remote CPU's queue).
3315 */
3316static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3317                              unsigned int *qtail)
3318{
3319        struct softnet_data *sd;
3320        unsigned long flags;
3321        unsigned int qlen;
3322
3323        sd = &per_cpu(softnet_data, cpu);
3324
3325        local_irq_save(flags);
3326
3327        rps_lock(sd);
3328        if (!netif_running(skb->dev))
3329                goto drop;
3330        qlen = skb_queue_len(&sd->input_pkt_queue);
3331        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3332                if (skb_queue_len(&sd->input_pkt_queue)) {
3333enqueue:
3334                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3335                        input_queue_tail_incr_save(sd, qtail);
3336                        rps_unlock(sd);
3337                        local_irq_restore(flags);
3338                        return NET_RX_SUCCESS;
3339                }
3340
3341                /* Schedule NAPI for the backlog device.
3342                 * We can use a non-atomic operation since we own the queue lock.
3343                 */
3344                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3345                        if (!rps_ipi_queued(sd))
3346                                ____napi_schedule(sd, &sd->backlog);
3347                }
3348                goto enqueue;
3349        }
3350
3351drop:
3352        sd->dropped++;
3353        rps_unlock(sd);
3354
3355        local_irq_restore(flags);
3356
3357        atomic_long_inc(&skb->dev->rx_dropped);
3358        kfree_skb(skb);
3359        return NET_RX_DROP;
3360}
3361
3362static int netif_rx_internal(struct sk_buff *skb)
3363{
3364        int ret;
3365
3366        net_timestamp_check(netdev_tstamp_prequeue, skb);
3367
3368        trace_netif_rx(skb);
3369#ifdef CONFIG_RPS
3370        if (static_key_false(&rps_needed)) {
3371                struct rps_dev_flow voidflow, *rflow = &voidflow;
3372                int cpu;
3373
3374                preempt_disable();
3375                rcu_read_lock();
3376
3377                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3378                if (cpu < 0)
3379                        cpu = smp_processor_id();
3380
3381                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3382
3383                rcu_read_unlock();
3384                preempt_enable();
3385        } else
3386#endif
3387        {
3388                unsigned int qtail;
3389                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3390                put_cpu();
3391        }
3392        return ret;
3393}
3394
3395/**
3396 *      netif_rx        -       post buffer to the network code
3397 *      @skb: buffer to post
3398 *
3399 *      This function receives a packet from a device driver and queues it for
3400 *      the upper (protocol) levels to process.  It always succeeds. The buffer
3401 *      may be dropped during processing for congestion control or by the
3402 *      protocol layers.
3403 *
3404 *      return values:
3405 *      NET_RX_SUCCESS  (no congestion)
3406 *      NET_RX_DROP     (packet was dropped)
3407 *
3408 */
3409
3410int netif_rx(struct sk_buff *skb)
3411{
3412        trace_netif_rx_entry(skb);
3413
3414        return netif_rx_internal(skb);
3415}
3416EXPORT_SYMBOL(netif_rx);
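
/*
 * A minimal sketch, assuming a hypothetical driver (not part of this file,
 * kept compiled out): the classic interrupt-time receive path that hands a
 * freshly built skb to netif_rx().  The foo_* name and buffer handling are
 * invented; only the netdev_alloc_skb_ip_align()/eth_type_trans()/netif_rx()
 * calls reflect the API documented above.
 */
#if 0
static void foo_rx_one(struct net_device *dev, const void *buf, unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb_ip_align(dev, len);
        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }
        memcpy(skb_put(skb, len), buf, len);
        skb->protocol = eth_type_trans(skb, dev);

        netif_rx(skb);          /* queues to the per-CPU backlog; may drop */
}
#endif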
3417
3418int netif_rx_ni(struct sk_buff *skb)
3419{
3420        int err;
3421
3422        trace_netif_rx_ni_entry(skb);
3423
3424        preempt_disable();
3425        err = netif_rx_internal(skb);
3426        if (local_softirq_pending())
3427                do_softirq();
3428        preempt_enable();
3429
3430        return err;
3431}
3432EXPORT_SYMBOL(netif_rx_ni);
3433
3434static void net_tx_action(struct softirq_action *h)
3435{
3436        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3437
3438        if (sd->completion_queue) {
3439                struct sk_buff *clist;
3440
3441                local_irq_disable();
3442                clist = sd->completion_queue;
3443                sd->completion_queue = NULL;
3444                local_irq_enable();
3445
3446                while (clist) {
3447                        struct sk_buff *skb = clist;
3448                        clist = clist->next;
3449
3450                        WARN_ON(atomic_read(&skb->users));
3451                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3452                                trace_consume_skb(skb);
3453                        else
3454                                trace_kfree_skb(skb, net_tx_action);
3455                        __kfree_skb(skb);
3456                }
3457        }
3458
3459        if (sd->output_queue) {
3460                struct Qdisc *head;
3461
3462                local_irq_disable();
3463                head = sd->output_queue;
3464                sd->output_queue = NULL;
3465                sd->output_queue_tailp = &sd->output_queue;
3466                local_irq_enable();
3467
3468                while (head) {
3469                        struct Qdisc *q = head;
3470                        spinlock_t *root_lock;
3471
3472                        head = head->next_sched;
3473
3474                        root_lock = qdisc_lock(q);
3475                        if (spin_trylock(root_lock)) {
3476                                smp_mb__before_atomic();
3477                                clear_bit(__QDISC_STATE_SCHED,
3478                                          &q->state);
3479                                qdisc_run(q);
3480                                spin_unlock(root_lock);
3481                        } else {
3482                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3483                                              &q->state)) {
3484                                        __netif_reschedule(q);
3485                                } else {
3486                                        smp_mb__before_atomic();
3487                                        clear_bit(__QDISC_STATE_SCHED,
3488                                                  &q->state);
3489                                }
3490                        }
3491                }
3492        }
3493}
3494
3495#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3496    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3497/* This hook is defined here for ATM LANE */
3498int (*br_fdb_test_addr_hook)(struct net_device *dev,
3499                             unsigned char *addr) __read_mostly;
3500EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3501#endif
3502
3503#ifdef CONFIG_NET_CLS_ACT
3504/* TODO: Maybe we should just force sch_ingress to be compiled in
3505 * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
3506 * instructions (a compare and two extra stores) when the ingress
3507 * scheduler is not built but CONFIG_NET_CLS_ACT is.
3508 * NOTE: This doesn't remove any functionality; if you don't have
3509 * the ingress scheduler, you just can't add policies on ingress.
3510 *
3511 */
3512static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3513{
3514        struct net_device *dev = skb->dev;
3515        u32 ttl = G_TC_RTTL(skb->tc_verd);
3516        int result = TC_ACT_OK;
3517        struct Qdisc *q;
3518
3519        if (unlikely(MAX_RED_LOOP < ttl++)) {
3520                net_warn_ratelimited("Redir loop detected, dropping packet (%d->%d)\n",
3521                                     skb->skb_iif, dev->ifindex);
3522                return TC_ACT_SHOT;
3523        }
3524
3525        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3526        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3527
3528        q = rcu_dereference(rxq->qdisc);
3529        if (q != &noop_qdisc) {
3530                spin_lock(qdisc_lock(q));
3531                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3532                        result = qdisc_enqueue_root(skb, q);
3533                spin_unlock(qdisc_lock(q));
3534        }
3535
3536        return result;
3537}
3538
3539static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3540                                         struct packet_type **pt_prev,
3541                                         int *ret, struct net_device *orig_dev)
3542{
3543        struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3544
3545        if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3546                goto out;
3547
3548        if (*pt_prev) {
3549                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3550                *pt_prev = NULL;
3551        }
3552
3553        switch (ing_filter(skb, rxq)) {
3554        case TC_ACT_SHOT:
3555        case TC_ACT_STOLEN:
3556                kfree_skb(skb);
3557                return NULL;
3558        }
3559
3560out:
3561        skb->tc_verd = 0;
3562        return skb;
3563}
3564#endif
3565
3566/**
3567 *      netdev_rx_handler_register - register receive handler
3568 *      @dev: device to register a handler for
3569 *      @rx_handler: receive handler to register
3570 *      @rx_handler_data: data pointer that is used by rx handler
3571 *
3572 *      Register a receive handler for a device. This handler will then be
3573 *      called from __netif_receive_skb. A negative errno code is returned
3574 *      on a failure.
3575 *
3576 *      The caller must hold the rtnl_mutex.
3577 *
3578 *      For a general description of rx_handler, see enum rx_handler_result.
3579 */
3580int netdev_rx_handler_register(struct net_device *dev,
3581                               rx_handler_func_t *rx_handler,
3582                               void *rx_handler_data)
3583{
3584        ASSERT_RTNL();
3585
3586        if (dev->rx_handler)
3587                return -EBUSY;
3588
3589        /* Note: rx_handler_data must be set before rx_handler */
3590        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3591        rcu_assign_pointer(dev->rx_handler, rx_handler);
3592
3593        return 0;
3594}
3595EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
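
/*
 * A minimal sketch, assuming a hypothetical upper driver in the spirit of
 * bridge/macvlan/bonding ports (not part of this file, kept compiled out):
 * registering an rx_handler under RTNL, and a handler that simply passes
 * frames up.  foo_port, foo_port_rx and foo_port_attach are invented names.
 */
#if 0
static rx_handler_result_t foo_port_rx(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);

        /* inspect, steal or redirect the skb here; see enum rx_handler_result */
        port->rx_packets++;
        return RX_HANDLER_PASS;
}

static int foo_port_attach(struct net_device *dev, struct foo_port *port)
{
        ASSERT_RTNL();

        /* returns -EBUSY if another handler is already installed */
        return netdev_rx_handler_register(dev, foo_port_rx, port);
}
#endif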
3596
3597/**
3598 *      netdev_rx_handler_unregister - unregister receive handler
3599 *      @dev: device to unregister a handler from
3600 *
3601 *      Unregister a receive handler from a device.
3602 *
3603 *      The caller must hold the rtnl_mutex.
3604 */
3605void netdev_rx_handler_unregister(struct net_device *dev)
3606{
3607
3608        ASSERT_RTNL();
3609        RCU_INIT_POINTER(dev->rx_handler, NULL);
3610        /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3611         * section is guaranteed to see a non-NULL rx_handler_data
3612         * as well.
3613         */
3614        synchronize_net();
3615        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3616}
3617EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3618
3619/*
3620 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3621 * the special handling of PFMEMALLOC skbs.
3622 */
3623static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3624{
3625        switch (skb->protocol) {
3626        case htons(ETH_P_ARP):
3627        case htons(ETH_P_IP):
3628        case htons(ETH_P_IPV6):
3629        case htons(ETH_P_8021Q):
3630        case htons(ETH_P_8021AD):
3631                return true;
3632        default:
3633                return false;
3634        }
3635}
3636
3637static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3638{
3639        struct packet_type *ptype, *pt_prev;
3640        rx_handler_func_t *rx_handler;
3641        struct net_device *orig_dev;
3642        struct net_device *null_or_dev;
3643        bool deliver_exact = false;
3644        int ret = NET_RX_DROP;
3645        __be16 type;
3646
3647        net_timestamp_check(!netdev_tstamp_prequeue, skb);
3648
3649        trace_netif_receive_skb(skb);
3650
3651
3652        orig_dev = skb->dev;
3653
3654        skb_reset_network_header(skb);
3655        if (!skb_transport_header_was_set(skb))
3656                skb_reset_transport_header(skb);
3657        skb_reset_mac_len(skb);
3658
3659        pt_prev = NULL;
3660
3661another_round:
3662        skb->skb_iif = skb->dev->ifindex;
3663
3664        __this_cpu_inc(softnet_data.processed);
3665
3666        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3667            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3668                skb = skb_vlan_untag(skb);
3669                if (unlikely(!skb))
3670                        goto out;
3671        }
3672
3673#ifdef CONFIG_NET_CLS_ACT
3674        if (skb->tc_verd & TC_NCLS) {
3675                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3676                goto ncls;
3677        }
3678#endif
3679
3680        if (pfmemalloc)
3681                goto skip_taps;
3682
3683        list_for_each_entry_rcu(ptype, &ptype_all, list) {
3684                if (!ptype->dev || ptype->dev == skb->dev) {
3685                        if (pt_prev)
3686                                ret = deliver_skb(skb, pt_prev, orig_dev);
3687                        pt_prev = ptype;
3688                }
3689        }
3690
3691skip_taps:
3692#ifdef CONFIG_NET_CLS_ACT
3693        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3694        if (!skb)
3695                goto out;
3696ncls:
3697#endif
3698
3699        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3700                goto drop;
3701
3702        if (vlan_tx_tag_present(skb)) {
3703                if (pt_prev) {
3704                        ret = deliver_skb(skb, pt_prev, orig_dev);
3705                        pt_prev = NULL;
3706                }
3707                if (vlan_do_receive(&skb))
3708                        goto another_round;
3709                else if (unlikely(!skb))
3710                        goto out;
3711        }
3712
3713        rx_handler = rcu_dereference(skb->dev->rx_handler);
3714        if (rx_handler) {
3715                if (pt_prev) {
3716                        ret = deliver_skb(skb, pt_prev, orig_dev);
3717                        pt_prev = NULL;
3718                }
3719                switch (rx_handler(&skb)) {
3720                case RX_HANDLER_CONSUMED:
3721                        ret = NET_RX_SUCCESS;
3722                        goto out;
3723                case RX_HANDLER_ANOTHER:
3724                        goto another_round;
3725                case RX_HANDLER_EXACT:
3726                        deliver_exact = true;
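                        /* fall through */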
3727                case RX_HANDLER_PASS:
3728                        break;
3729                default:
3730                        BUG();
3731                }
3732        }
3733
3734        if (unlikely(vlan_tx_tag_present(skb))) {
3735                if (vlan_tx_tag_get_id(skb))
3736                        skb->pkt_type = PACKET_OTHERHOST;
3737                /* Note: we might in the future use prio bits
3738                 * and set skb->priority as in vlan_do_receive().
3739                 * For the time being, just ignore the Priority Code Point.
3740                 */
3741                skb->vlan_tci = 0;
3742        }
3743
3744        /* deliver only exact match when indicated */
3745        null_or_dev = deliver_exact ? skb->dev : NULL;
3746
3747        type = skb->protocol;
3748        list_for_each_entry_rcu(ptype,
3749                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3750                if (ptype->type == type &&
3751                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3752                     ptype->dev == orig_dev)) {
3753                        if (pt_prev)
3754                                ret = deliver_skb(skb, pt_prev, orig_dev);
3755                        pt_prev = ptype;
3756                }
3757        }
3758
3759        if (pt_prev) {
3760                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3761                        goto drop;
3762                else
3763                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3764        } else {
3765drop:
3766                atomic_long_inc(&skb->dev->rx_dropped);
3767                kfree_skb(skb);
3768                /* Jamal, now you will not be able to escape explaining
3769                 * to me how you were going to use this. :-)
3770                 */
3771                ret = NET_RX_DROP;
3772        }
3773
3774out:
3775        return ret;
3776}
3777
3778static int __netif_receive_skb(struct sk_buff *skb)
3779{
3780        int ret;
3781
3782        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3783                unsigned long pflags = current->flags;
3784
3785                /*
3786                 * PFMEMALLOC skbs are special, they should
3787                 * - be delivered to SOCK_MEMALLOC sockets only
3788                 * - stay away from userspace
3789                 * - have bounded memory usage
3790                 *
3791                 * Use PF_MEMALLOC as this saves us from propagating the allocation
3792                 * context down to all allocation sites.
3793                 */
3794                current->flags |= PF_MEMALLOC;
3795                ret = __netif_receive_skb_core(skb, true);
3796                tsk_restore_flags(current, pflags, PF_MEMALLOC);
3797        } else
3798                ret = __netif_receive_skb_core(skb, false);
3799
3800        return ret;
3801}
3802
3803static int netif_receive_skb_internal(struct sk_buff *skb)
3804{
3805        int ret;
3806
3807        net_timestamp_check(netdev_tstamp_prequeue, skb);
3808
3809        if (skb_defer_rx_timestamp(skb))
3810                return NET_RX_SUCCESS;
3811
3812        rcu_read_lock();
3813
3814#ifdef CONFIG_RPS
3815        if (static_key_false(&rps_needed)) {
3816                struct rps_dev_flow voidflow, *rflow = &voidflow;
3817                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3818
3819                if (cpu >= 0) {
3820                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3821                        rcu_read_unlock();
3822                        return ret;
3823                }
3824        }
3825#endif
3826        ret = __netif_receive_skb(skb);
3827        rcu_read_unlock();
3828        return ret;
3829}
3830
3831/**
3832 *      netif_receive_skb - process receive buffer from network
3833 *      @skb: buffer to process
3834 *
3835 *      netif_receive_skb() is the main receive data processing function.
3836 *      It always succeeds. The buffer may be dropped during processing
3837 *      for congestion control or by the protocol layers.
3838 *
3839 *      This function may only be called from softirq context and interrupts
3840 *      should be enabled.
3841 *
3842 *      Return values (usually ignored):
3843 *      NET_RX_SUCCESS: no congestion
3844 *      NET_RX_DROP: packet was dropped
3845 */
3846int netif_receive_skb(struct sk_buff *skb)
3847{
3848        trace_netif_receive_skb_entry(skb);
3849
3850        return netif_receive_skb_internal(skb);
3851}
3852EXPORT_SYMBOL(netif_receive_skb);
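
/*
 * A minimal sketch, assuming a hypothetical NAPI driver (not part of this
 * file, kept compiled out): a poll() callback that feeds received frames to
 * netif_receive_skb() from softirq context, as required above, and completes
 * NAPI when the budget is not exhausted.  foo_priv, foo_rx_pending(),
 * foo_get_rx_skb() and foo_enable_rx_irq() are invented.
 */
#if 0
static int foo_poll(struct napi_struct *napi, int budget)
{
        struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
        int work = 0;

        while (work < budget && foo_rx_pending(priv)) {
                struct sk_buff *skb = foo_get_rx_skb(priv);    /* hypothetical */

                skb->protocol = eth_type_trans(skb, priv->netdev);
                netif_receive_skb(skb);
                work++;
        }

        if (work < budget) {
                napi_complete(napi);
                foo_enable_rx_irq(priv);        /* hypothetical helper */
        }
        return work;
}
#endif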
3853
3854/* Network device is going away; flush any packets still pending.
3855 * Called with IRQs disabled.
3856 */
3857static void flush_backlog(void *arg)
3858{
3859        struct net_device *dev = arg;
3860        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3861        struct sk_buff *skb, *tmp;
3862
3863        rps_lock(sd);
3864        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3865                if (skb->dev == dev) {
3866                        __skb_unlink(skb, &sd->input_pkt_queue);
3867                        kfree_skb(skb);
3868                        input_queue_head_incr(sd);
3869                }
3870        }
3871        rps_unlock(sd);
3872
3873        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3874                if (skb->dev == dev) {
3875                        __skb_unlink(skb, &sd->process_queue);
3876                        kfree_skb(skb);
3877                        input_queue_head_incr(sd);
3878                }
3879        }
3880}
3881
3882static int BCMFASTPATH_HOST napi_gro_complete(struct sk_buff *skb)
3883{
3884        struct packet_offload *ptype;
3885        __be16 type = skb->protocol;
3886        struct list_head *head = &offload_base;
3887        int err = -ENOENT;
3888
3889        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3890
3891        if (NAPI_GRO_CB(skb)->count == 1) {
3892                skb_shinfo(skb)->gso_size = 0;
3893                goto out;
3894        }
3895
3896        rcu_read_lock();
3897        list_for_each_entry_rcu(ptype, head, list) {
3898                if (ptype->type != type || !ptype->callbacks.gro_complete)
3899                        continue;
3900
3901                err = ptype->callbacks.gro_complete(skb, 0);
3902                break;
3903        }
3904        rcu_read_unlock();
3905
3906        if (err) {
3907                WARN_ON(&ptype->list == head);
3908                kfree_skb(skb);
3909                return NET_RX_SUCCESS;
3910        }
3911
3912out:
3913        return netif_receive_skb_internal(skb);
3914}
3915
3916/* napi->gro_list contains packets ordered by age, with the
3917 * youngest packets at the head of the list.
3918 * Complete skbs in reverse order to reduce latencies.
3919 */
3920void BCMFASTPATH_HOST napi_gro_flush(struct napi_struct *napi, bool flush_old)
3921{
3922        struct sk_buff *skb, *prev = NULL;
3923
3924        /* scan list and build reverse chain */
3925        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3926                skb->prev = prev;
3927                prev = skb;
3928        }
3929
3930        for (skb = prev; skb; skb = prev) {
3931                skb->next = NULL;
3932
3933                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3934                        return;
3935
3936                prev = skb->prev;
3937                napi_gro_complete(skb);
3938                napi->gro_count--;
3939        }
3940
3941        napi->gro_list = NULL;
3942}
3943EXPORT_SYMBOL(napi_gro_flush);
3944
3945static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3946{
3947        struct sk_buff *p;
3948        unsigned int maclen = skb->dev->hard_header_len;
3949        u32 hash = skb_get_hash_raw(skb);
3950
3951        for (p = napi->gro_list; p; p = p->next) {
3952                unsigned long diffs;
3953
3954                NAPI_GRO_CB(p)->flush = 0;
3955
3956                if (hash != skb_get_hash_raw(p)) {
3957                        NAPI_GRO_CB(p)->same_flow = 0;
3958                        continue;
3959                }
3960
3961                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3962                diffs |= p->vlan_tci ^ skb->vlan_tci;
3963                if (maclen == ETH_HLEN)
3964                        diffs |= compare_ether_header(skb_mac_header(p),
3965                                                      skb_mac_header(skb));
3966                else if (!diffs)
3967                        diffs = memcmp(skb_mac_header(p),
3968                                       skb_mac_header(skb),
3969                                       maclen);
3970                NAPI_GRO_CB(p)->same_flow = !diffs;
3971        }
3972}
3973
3974static void skb_gro_reset_offset(struct sk_buff *skb)
3975{
3976        const struct skb_shared_info *pinfo = skb_shinfo(skb);
3977        const skb_frag_t *frag0 = &pinfo->frags[0];
3978
3979        NAPI_GRO_CB(skb)->data_offset = 0;
3980        NAPI_GRO_CB(skb)->frag0 = NULL;
3981        NAPI_GRO_CB(skb)->frag0_len = 0;
3982
3983        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3984            pinfo->nr_frags &&
3985            !PageHighMem(skb_frag_page(frag0))) {
3986                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3987                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3988        }
3989}
3990
3991static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3992{
3993        struct skb_shared_info *pinfo = skb_shinfo(skb);
3994
3995        BUG_ON(skb->end - skb->tail < grow);
3996
3997        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3998
3999        skb->data_len -= grow;
4000        skb->tail += grow;
4001
4002        pinfo->frags[0].page_offset += grow;
4003        skb_frag_size_sub(&pinfo->frags[0], grow);
4004
4005        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4006                skb_frag_unref(skb, 0);
4007                memmove(pinfo->frags, pinfo->frags + 1,
4008                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4009        }
4010}
4011
4012static enum gro_result BCMFASTPATH_HOST dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4013{
4014        struct sk_buff **pp = NULL;
4015        struct packet_offload *ptype;
4016        __be16 type = skb->protocol;
4017        struct list_head *head = &offload_base;
4018        int same_flow;
4019        enum gro_result ret;
4020        int grow;
4021
4022        if (!(skb->dev->features & NETIF_F_GRO))
4023                goto normal;
4024
4025        if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4026                goto normal;
4027
4028        gro_list_prepare(napi, skb);
4029
4030        rcu_read_lock();
4031        list_for_each_entry_rcu(ptype, head, list) {
4032                if (ptype->type != type || !ptype->callbacks.gro_receive)
4033                        continue;
4034
4035                skb_set_network_header(skb, skb_gro_offset(skb));
4036                skb_reset_mac_len(skb);
4037                NAPI_GRO_CB(skb)->same_flow = 0;
4038                NAPI_GRO_CB(skb)->flush = 0;
4039                NAPI_GRO_CB(skb)->free = 0;
4040                NAPI_GRO_CB(skb)->encap_mark = 0;
4041
4042                /* Setup for GRO checksum validation */
4043                switch (skb->ip_summed) {
4044                case CHECKSUM_COMPLETE:
4045                        NAPI_GRO_CB(skb)->csum = skb->csum;
4046                        NAPI_GRO_CB(skb)->csum_valid = 1;
4047                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4048                        break;
4049                case CHECKSUM_UNNECESSARY:
4050                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4051                        NAPI_GRO_CB(skb)->csum_valid = 0;
4052                        break;
4053                default:
4054                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4055                        NAPI_GRO_CB(skb)->csum_valid = 0;
4056                }
4057
4058                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4059                break;
4060        }
4061        rcu_read_unlock();
4062
4063        if (&ptype->list == head)
4064                goto normal;
4065
4066        same_flow = NAPI_GRO_CB(skb)->same_flow;
4067        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4068
4069        if (pp) {
4070                struct sk_buff *nskb = *pp;
4071
4072                *pp = nskb->next;
4073                nskb->next = NULL;
4074                napi_gro_complete(nskb);
4075                napi->gro_count--;
4076        }
4077
4078        if (same_flow)
4079                goto ok;
4080
4081        if (NAPI_GRO_CB(skb)->flush)
4082                goto normal;
4083
4084        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4085                struct sk_buff *nskb = napi->gro_list;
4086
4087                /* locate the end of the list to select the 'oldest' flow */
4088                while (nskb->next) {
4089                        pp = &nskb->next;
4090                        nskb = *pp;
4091                }
4092                *pp = NULL;
4093                nskb->next = NULL;
4094                napi_gro_complete(nskb);
4095        } else {
4096                napi->gro_count++;
4097        }
4098        NAPI_GRO_CB(skb)->count = 1;
4099        NAPI_GRO_CB(skb)->age = jiffies;
4100        NAPI_GRO_CB(skb)->last = skb;
4101        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4102        skb->next = napi->gro_list;
4103        napi->gro_list = skb;
4104        ret = GRO_HELD;
4105
4106pull:
4107        grow = skb_gro_offset(skb) - skb_headlen(skb);
4108        if (grow > 0)
4109                gro_pull_from_frag0(skb, grow);
4110ok:
4111        return ret;
4112
4113normal:
4114        ret = GRO_NORMAL;
4115        goto pull;
4116}
4117
4118struct packet_offload *gro_find_receive_by_type(__be16 type)
4119{
4120        struct list_head *offload_head = &offload_base;
4121        struct packet_offload *ptype;
4122
4123        list_for_each_entry_rcu(ptype, offload_head, list) {
4124                if (ptype->type != type || !ptype->callbacks.gro_receive)
4125                        continue;
4126                return ptype;
4127        }
4128        return NULL;
4129}
4130EXPORT_SYMBOL(gro_find_receive_by_type);
4131
4132struct packet_offload *gro_find_complete_by_type(__be16 type)
4133{
4134        struct list_head *offload_head = &offload_base;
4135        struct packet_offload *ptype;
4136
4137        list_for_each_entry_rcu(ptype, offload_head, list) {
4138                if (ptype->type != type || !ptype->callbacks.gro_complete)
4139                        continue;
4140                return ptype;
4141        }
4142        return NULL;
4143}
4144EXPORT_SYMBOL(gro_find_complete_by_type);
4145
4146static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4147{
4148        switch (ret) {
4149        case GRO_NORMAL:
4150                if (netif_receive_skb_internal(skb))
4151                        ret = GRO_DROP;
4152                break;
4153
4154        case GRO_DROP:
4155                kfree_skb(skb);
4156                break;
4157
4158        case GRO_MERGED_FREE:
4159                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4160                        kmem_cache_free(skbuff_head_cache, skb);
4161                else
4162                        __kfree_skb(skb);
4163                break;
4164
4165        case GRO_HELD:
4166        case GRO_MERGED:
4167                break;
4168        }
4169
4170        return ret;
4171}
4172
4173
4174gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4175{
4176        trace_napi_gro_receive_entry(skb);
4177
4178        skb_gro_reset_offset(skb);
4179
4180        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4181}
4182EXPORT_SYMBOL(napi_gro_receive);
4183
4184static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4185{
4186        if (unlikely(skb->pfmemalloc)) {
4187                consume_skb(skb);
4188                return;
4189        }
4190        __skb_pull(skb, skb_headlen(skb));
4191        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4192        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4193        skb->vlan_tci = 0;
4194        skb->dev = napi->dev;
4195        skb->skb_iif = 0;
4196        skb->encapsulation = 0;
4197        skb_shinfo(skb)->gso_type = 0;
4198        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4199
4200        napi->skb = skb;
4201}
4202
4203struct sk_buff *napi_get_frags(struct napi_struct *napi)
4204{
4205        struct sk_buff *skb = napi->skb;
4206
4207        if (!skb) {
4208                skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4209                napi->skb = skb;
4210        }
4211        return skb;
4212}
4213EXPORT_SYMBOL(napi_get_frags);
4214
4215static gro_result_t napi_frags_finish(struct napi_struct *napi,
4216                                      struct sk_buff *skb,
4217                                      gro_result_t ret)
4218{
4219        switch (ret) {
4220        case GRO_NORMAL:
4221        case GRO_HELD:
4222                __skb_push(skb, ETH_HLEN);
4223                skb->protocol = eth_type_trans(skb, skb->dev);
4224                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4225                        ret = GRO_DROP;
4226                break;
4227
4228        case GRO_DROP:
4229        case GRO_MERGED_FREE:
4230                napi_reuse_skb(napi, skb);
4231                break;
4232
4233        case GRO_MERGED:
4234                break;
4235        }
4236
4237        return ret;
4238}
4239
4240/* The upper GRO stack assumes the network header starts at gro_offset=0.
4241 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4242 * we copy the Ethernet header into skb->data to have a common layout.
4243 */
4244static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4245{
4246        struct sk_buff *skb = napi->skb;
4247        const struct ethhdr *eth;
4248        unsigned int hlen = sizeof(*eth);
4249
4250        napi->skb = NULL;
4251
4252        skb_reset_mac_header(skb);
4253        skb_gro_reset_offset(skb);
4254
4255        eth = skb_gro_header_fast(skb, 0);
4256        if (unlikely(skb_gro_header_hard(skb, hlen))) {
4257                eth = skb_gro_header_slow(skb, hlen, 0);
4258                if (unlikely(!eth)) {
4259                        napi_reuse_skb(napi, skb);
4260                        return NULL;
4261                }
4262        } else {
4263                gro_pull_from_frag0(skb, hlen);
4264                NAPI_GRO_CB(skb)->frag0 += hlen;
4265                NAPI_GRO_CB(skb)->frag0_len -= hlen;
4266        }
4267        __skb_pull(skb, hlen);
4268
4269        /*
4270         * This works because the only protocols we care about don't require
4271         * special handling.
4272         * We'll fix it up properly in napi_frags_finish()
4273         */
4274        skb->protocol = eth->h_proto;
4275
4276        return skb;
4277}
4278
4279gro_result_t napi_gro_frags(struct napi_struct *napi)
4280{
4281        struct sk_buff *skb = napi_frags_skb(napi);
4282
4283        if (!skb)
4284                return GRO_DROP;
4285
4286        trace_napi_gro_frags_entry(skb);
4287
4288        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4289}
4290EXPORT_SYMBOL(napi_gro_frags);
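
/*
 * A minimal sketch, assuming a hypothetical page-based driver (not part of
 * this file, kept compiled out): the napi_get_frags()/napi_gro_frags()
 * pattern described above, where the driver attaches page fragments and the
 * core pulls the Ethernet header.  foo_rx_desc and its fields are invented.
 */
#if 0
static void foo_rx_frag(struct napi_struct *napi, struct foo_rx_desc *desc)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (!skb)
                return;         /* out of memory; frame is dropped */

        skb_fill_page_desc(skb, 0, desc->page, desc->offset, desc->len);
        skb->len += desc->len;
        skb->data_len += desc->len;
        skb->truesize += PAGE_SIZE;

        napi_gro_frags(napi);   /* may return GRO_DROP on bad frames */
}
#endif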
4291
4292/* Compute the checksum from gro_offset and return the folded value
4293 * after adding in any pseudo checksum.
4294 */
4295__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4296{
4297        __wsum wsum;
4298        __sum16 sum;
4299
4300        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4301
4302        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4303        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4304        if (likely(!sum)) {
4305                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4306                    !skb->csum_complete_sw)
4307                        netdev_rx_csum_fault(skb->dev);
4308        }
4309
4310        NAPI_GRO_CB(skb)->csum = wsum;
4311        NAPI_GRO_CB(skb)->csum_valid = 1;
4312
4313        return sum;
4314}
4315EXPORT_SYMBOL(__skb_gro_checksum_complete);
4316
4317/*
4318 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4319 * Note: called with local IRQs disabled, but exits with local IRQs enabled.
4320 */
4321static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4322{
4323#ifdef CONFIG_RPS
4324        struct softnet_data *remsd = sd->rps_ipi_list;
4325
4326        if (remsd) {
4327                sd->rps_ipi_list = NULL;
4328
4329                local_irq_enable();
4330
4331                /* Send pending IPIs to kick RPS processing on remote CPUs. */
4332                while (remsd) {
4333                        struct softnet_data *next = remsd->rps_ipi_next;
4334
4335                        if (cpu_online(remsd->cpu))
4336                                smp_call_function_single_async(remsd->cpu,
4337                                                           &remsd->csd);
4338                        remsd = next;
4339                }
4340        } else
4341#endif
4342                local_irq_enable();
4343}
4344
4345static int process_backlog(struct napi_struct *napi, int quota)
4346{
4347        int work = 0;
4348        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4349
4350#ifdef CONFIG_RPS
4351        /* Check if we have pending IPIs; it is better to send them now
4352         * rather than waiting for net_rx_action() to end.
4353         */
4354        if (sd->rps_ipi_list) {
4355                local_irq_disable();
4356                net_rps_action_and_irq_enable(sd);
4357        }
4358#endif
4359        napi->weight = weight_p;
4360        local_irq_disable();
4361        while (1) {
4362                struct sk_buff *skb;
4363
4364                while ((skb = __skb_dequeue(&sd->process_queue))) {
4365                        rcu_read_lock();
4366                        local_irq_enable();
4367                        __netif_receive_skb(skb);
4368                        rcu_read_unlock();
4369                        local_irq_disable();
4370                        input_queue_head_incr(sd);
4371                        if (++work >= quota) {
4372                                local_irq_enable();
4373                                return work;
4374                        }
4375                }
4376
4377                rps_lock(sd);
4378                if (skb_queue_empty(&sd->input_pkt_queue)) {
4379                        /*
4380                         * Inline a custom version of __napi_complete().
4381                         * Only the current CPU owns and manipulates this napi,
4382                         * and NAPI_STATE_SCHED is the only possible flag set
4383                         * on the backlog.
4384                         * We can use a plain write instead of clear_bit(),
4385                         * and we don't need an smp_mb() memory barrier.
4386                         */
4387                        list_del(&napi->poll_list);
4388                        napi->state = 0;
4389                        rps_unlock(sd);
4390
4391                        break;
4392                }
4393
4394                skb_queue_splice_tail_init(&sd->input_pkt_queue,
4395                                           &sd->process_queue);
4396                rps_unlock(sd);
4397        }
4398        local_irq_enable();
4399
4400        return work;
4401}
4402
4403/**
4404 * __napi_schedule - schedule for receive
4405 * @n: entry to schedule
4406 *
4407 * The entry's receive function will be scheduled to run
4408 */
4409void __napi_schedule(struct napi_struct *n)
4410{
4411        unsigned long flags;
4412
4413        local_irq_save(flags);
4414        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4415        local_irq_restore(flags);
4416}
4417EXPORT_SYMBOL(__napi_schedule);
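
/*
 * A minimal sketch, assuming a hypothetical driver interrupt handler (not
 * part of this file, kept compiled out): the usual pattern of masking the
 * device RX interrupt and scheduling NAPI, which reaches ____napi_schedule()
 * via __napi_schedule().  foo_priv and foo_disable_rx_irq() are invented.
 */
#if 0
static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
        struct foo_priv *priv = dev_id;

        if (napi_schedule_prep(&priv->napi)) {
                foo_disable_rx_irq(priv);       /* hypothetical helper */
                __napi_schedule(&priv->napi);
        }
        return IRQ_HANDLED;
}
#endif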
4418
4419void __napi_complete(struct napi_struct *n)
4420{
4421        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4422        BUG_ON(n->gro_list);
4423
4424        list_del(&n->poll_list);
4425        smp_mb__before_atomic();
4426        clear_bit(NAPI_STATE_SCHED, &n->state);
4427}
4428EXPORT_SYMBOL(__napi_complete);
4429
4430void napi_complete(struct napi_struct *n)
4431{
4432        unsigned long flags;
4433
4434        /*
4435         * don't let napi dequeue from the CPU poll list
4436         * just in case it's running on a different CPU
4437         */
4438        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4439                return;
4440
4441        napi_gro_flush(n, false);
4442        local_irq_save(flags);
4443        __napi_complete(n);
4444        local_irq_restore(flags);
4445}
4446EXPORT_SYMBOL(napi_complete);
4447
4448/* must be called under rcu_read_lock(), as we don't take a reference */
4449struct napi_struct *napi_by_id(unsigned int napi_id)
4450{
4451        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4452        struct napi_struct *napi;
4453
4454        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4455                if (napi->napi_id == napi_id)
4456                        return napi;
4457
4458        return NULL;
4459}
4460EXPORT_SYMBOL_GPL(napi_by_id);
4461
4462void napi_hash_add(struct napi_struct *napi)
4463{
4464        if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4465
4466                spin_lock(&napi_hash_lock);
4467
4468                /* 0 is not a valid id, and we also skip an id that is already
4469                 * taken; we expect both events to be extremely rare.
4470                 */
4471                napi->napi_id = 0;
4472                while (!napi->napi_id) {
4473                        napi->napi_id = ++napi_gen_id;
4474                        if (napi_by_id(napi->napi_id))
4475                                napi->napi_id = 0;
4476                }
4477
4478                hlist_add_head_rcu(&napi->napi_hash_node,
4479                        &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4480
4481                spin_unlock(&napi_hash_lock);
4482        }
4483}
4484EXPORT_SYMBOL_GPL(napi_hash_add);
4485
4486/* Warning: the caller is responsible for making sure an RCU grace period
4487 * has elapsed before freeing the memory containing @napi.
4488 */
4489void napi_hash_del(struct napi_struct *napi)
4490{
4491        spin_lock(&napi_hash_lock);
4492
4493        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4494                hlist_del_rcu(&napi->napi_hash_node);
4495
4496        spin_unlock(&napi_hash_lock);
4497}
4498EXPORT_SYMBOL_GPL(napi_hash_del);
4499
4500void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4501                    int (*poll)(struct napi_struct *, int), int weight)
4502{
4503        INIT_LIST_HEAD(&napi->poll_list);
4504        napi->gro_count = 0;
4505        napi->gro_list = NULL;
4506        napi->skb = NULL;
4507        napi->poll = poll;
4508        if (weight > NAPI_POLL_WEIGHT)
4509                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4510                            weight, dev->name);
4511        napi->weight = weight;
4512        list_add(&napi->dev_list, &dev->napi_list);
4513        napi->dev = dev;
4514#ifdef CONFIG_NETPOLL
4515        spin_lock_init(&napi->poll_lock);
4516        napi->poll_owner = -1;
4517#endif
4518        set_bit(NAPI_STATE_SCHED, &napi->state);
4519}
4520EXPORT_SYMBOL(netif_napi_add);
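
/*
 * A minimal sketch, assuming a hypothetical probe path (not part of this
 * file, kept compiled out): registering the per-device NAPI context with
 * netif_napi_add() before the device is opened.  NAPI_POLL_WEIGHT (64) is
 * the conventional weight; larger values trigger the pr_err_once() above.
 * foo_priv and foo_poll are invented.
 */
#if 0
static int foo_setup_napi(struct foo_priv *priv, struct net_device *netdev)
{
        netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
        napi_enable(&priv->napi);       /* often done in ndo_open instead */
        return 0;
}
#endif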
4521
4522void netif_napi_del(struct napi_struct *napi)
4523{
4524        list_del_init(&napi->dev_list);
4525        napi_free_frags(napi);
4526
4527        kfree_skb_list(napi->gro_list);
4528        napi->gro_list = NULL;
4529        napi->gro_count = 0;
4530}
4531EXPORT_SYMBOL(netif_napi_del);
4532
4533static void net_rx_action(struct softirq_action *h)
4534{
4535        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4536        unsigned long time_limit = jiffies + 2;
4537        int budget = netdev_budget;
4538        void *have;
4539
4540        local_irq_disable();
4541
4542        while (!list_empty(&sd->poll_list)) {
4543                struct napi_struct *n;
4544                int work, weight;
4545
4546                /* If the softirq window is exhausted then punt.
4547                 * Allow this to run for 2 jiffies, which allows
4548                 * an average latency of 1.5/HZ.
4549                 */
4550                if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4551                        goto softnet_break;
4552
4553                local_irq_enable();
4554
4555                /* Even though interrupts have been re-enabled, this
4556                 * access is safe because interrupts can only add new
4557                 * entries to the tail of this list, and only ->poll()
4558                 * calls can remove this head entry from the list.
4559                 */
4560                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4561
4562                have = netpoll_poll_lock(n);
4563
4564                weight = n->weight;
4565
4566                /* This NAPI_STATE_SCHED test is for avoiding a race
4567                 * with netpoll's poll_napi().  Only the entity which
4568                 * obtains the lock and sees NAPI_STATE_SCHED set will
4569                 * actually make the ->poll() call.  Therefore we avoid
4570                 * accidentally calling ->poll() when NAPI is not scheduled.
4571                 */
4572                work = 0;
4573                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4574                        work = n->poll(n, weight);
4575                        trace_napi_poll(n);
4576                }
4577
4578                WARN_ON_ONCE(work > weight);
4579
4580                budget -= work;
4581
4582                local_irq_disable();
4583
4584                /* Drivers must not modify the NAPI state if they
4585                 * consume the entire weight.  In such cases this code
4586                 * still "owns" the NAPI instance and therefore can
4587                 * move the instance around on the list at-will.
4588                 */
4589                if (unlikely(work == weight)) {
4590                        if (unlikely(napi_disable_pending(n))) {
4591                                local_irq_enable();
4592                                napi_complete(n);
4593                                local_irq_disable();
4594                        } else {
4595                                if (n->gro_list) {
4596                                        /* flush too old packets
4597                                         * If HZ < 1000, flush all packets.
4598                                         */
4599                                        local_irq_enable();
4600                                        napi_gro_flush(n, HZ >= 1000);
4601                                        local_irq_disable();
4602                                }
4603                                list_move_tail(&n->poll_list, &sd->poll_list);
4604                        }
4605                }
4606
4607                netpoll_poll_unlock(have);
4608        }
4609out:
4610        net_rps_action_and_irq_enable(sd);
4611
4612        return;
4613
4614softnet_break:
4615        sd->time_squeeze++;
4616        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4617        goto out;
4618}
4619
4620struct netdev_adjacent {
4621        struct net_device *dev;
4622
4623        /* upper master flag, there can only be one master device per list */
4624        bool master;
4625
4626        /* counter for the number of times this device was added to us */
4627        u16 ref_nr;
4628
4629        /* private field for the users */
4630        void *private;
4631
4632        struct list_head list;
4633        struct rcu_head rcu;
4634};
4635
4636static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4637                                                 struct net_device *adj_dev,
4638                                                 struct list_head *adj_list)
4639{
4640        struct netdev_adjacent *adj;
4641
4642        list_for_each_entry(adj, adj_list, list) {
4643                if (adj->dev == adj_dev)
4644                        return adj;
4645        }
4646        return NULL;
4647}
4648
4649/**
4650 * netdev_has_upper_dev - Check if device is linked to an upper device
4651 * @dev: device
4652 * @upper_dev: upper device to check
4653 *
4654 * Find out if a device is linked to the specified upper device and return
4655 * true in case it is. Note that this checks only the immediate upper device,
4656 * not the complete stack of devices. The caller must hold the RTNL lock.
4657 */
4658bool netdev_has_upper_dev(struct net_device *dev,
4659                          struct net_device *upper_dev)
4660{
4661        ASSERT_RTNL();
4662
4663        return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4664}
4665EXPORT_SYMBOL(netdev_has_upper_dev);
4666
4667/**
4668 * netdev_has_any_upper_dev - Check if device is linked to some device
4669 * @dev: device
4670 *
4671 * Find out if a device is linked to an upper device and return true in case
4672 * it is. The caller must hold the RTNL lock.
4673 */
4674static bool netdev_has_any_upper_dev(struct net_device *dev)
4675{
4676        ASSERT_RTNL();
4677
4678        return !list_empty(&dev->all_adj_list.upper);
4679}
4680
4681/**
4682 * netdev_master_upper_dev_get - Get master upper device
4683 * @dev: device
4684 *
4685 * Find a master upper device and return pointer to it or NULL in case
4686 * it's not there. The caller must hold the RTNL lock.
4687 */
4688struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4689{
4690        struct netdev_adjacent *upper;
4691
4692        ASSERT_RTNL();
4693
4694        if (list_empty(&dev->adj_list.upper))
4695                return NULL;
4696
4697        upper = list_first_entry(&dev->adj_list.upper,
4698                                 struct netdev_adjacent, list);
4699        if (likely(upper->master))
4700                return upper->dev;
4701        return NULL;
4702}
4703EXPORT_SYMBOL(netdev_master_upper_dev_get);
4704
4705void *netdev_adjacent_get_private(struct list_head *adj_list)
4706{
4707        struct netdev_adjacent *adj;
4708
4709        adj = list_entry(adj_list, struct netdev_adjacent, list);
4710
4711        return adj->private;
4712}
4713EXPORT_SYMBOL(netdev_adjacent_get_private);
4714
4715/**
4716 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4717 * @dev: device
4718 * @iter: list_head ** of the current position
4719 *
4720 * Gets the next device from the dev's upper list, starting from iter
4721 * position. The caller must hold RCU read lock.
4722 */
4723struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4724                                                 struct list_head **iter)
4725{
4726        struct netdev_adjacent *upper;
4727
4728        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4729
4730        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4731
4732        if (&upper->list == &dev->adj_list.upper)
4733                return NULL;
4734
4735        *iter = &upper->list;
4736
4737        return upper->dev;
4738}
4739EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
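/* Illustrative sketch of the iteration pattern this accessor supports: start
 * @iter at &dev->adj_list.upper and keep calling until NULL is returned, all
 * under rcu_read_lock(). The walker below is a made-up example.
 */
static __maybe_unused void example_walk_upper_devs(struct net_device *dev)
{
	struct net_device *upper;
	struct list_head *iter;

	rcu_read_lock();
	iter = &dev->adj_list.upper;
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		pr_debug("%s: upper device %s\n", dev->name, upper->name);
	rcu_read_unlock();
}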
4740
4741/**
4742 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4743 * @dev: device
4744 * @iter: list_head ** of the current position
4745 *
4746 * Gets the next device from the dev's upper list, starting from iter
4747 * position. The caller must hold RCU read lock.
4748 */
4749struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4750                                                     struct list_head **iter)
4751{
4752        struct netdev_adjacent *upper;
4753
4754        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4755
4756        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4757
4758        if (&upper->list == &dev->all_adj_list.upper)
4759                return NULL;
4760
4761        *iter = &upper->list;
4762
4763        return upper->dev;
4764}
4765EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4766
4767/**
4768 * netdev_lower_get_next_private - Get the next ->private from the
4769 *                                 lower neighbour list
4770 * @dev: device
4771 * @iter: list_head ** of the current position
4772 *
4773 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4774 * list, starting from iter position. The caller must hold either hold the
4775 * RTNL lock or its own locking that guarantees that the neighbour lower
4776 * list will remain unchainged.
4777 */
4778void *netdev_lower_get_next_private(struct net_device *dev,
4779                                    struct list_head **iter)
4780{
4781        struct netdev_adjacent *lower;
4782
4783        lower = list_entry(*iter, struct netdev_adjacent, list);
4784
4785        if (&lower->list == &dev->adj_list.lower)
4786                return NULL;
4787
4788        *iter = lower->list.next;
4789
4790        return lower->private;
4791}
4792EXPORT_SYMBOL(netdev_lower_get_next_private);
4793
4794/**
4795 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4796 *                                     lower neighbour list, RCU
4797 *                                     variant
4798 * @dev: device
4799 * @iter: list_head ** of the current position
4800 *
4801 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4802 * list, starting from iter position. The caller must hold RCU read lock.
4803 */
4804void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4805                                        struct list_head **iter)
4806{
4807        struct netdev_adjacent *lower;
4808
4809        WARN_ON_ONCE(!rcu_read_lock_held());
4810
4811        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4812
4813        if (&lower->list == &dev->adj_list.lower)
4814                return NULL;
4815
4816        *iter = &lower->list;
4817
4818        return lower->private;
4819}
4820EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4821
4822/**
4823 * netdev_lower_get_next - Get the next device from the lower neighbour
4824 *                         list
4825 * @dev: device
4826 * @iter: list_head ** of the current position
4827 *
4828 * Gets the next netdev_adjacent from the dev's lower neighbour
4829 * list, starting from iter position. The caller must hold the RTNL lock or
4830 * its own locking that guarantees that the neighbour lower
4831 * list will remain unchanged.
4832 */
4833void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4834{
4835        struct netdev_adjacent *lower;
4836
4837        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4838
4839        if (&lower->list == &dev->adj_list.lower)
4840                return NULL;
4841
4842        *iter = &lower->list;
4843
4844        return lower->dev;
4845}
4846EXPORT_SYMBOL(netdev_lower_get_next);
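/* Illustrative sketch: counting direct lower devices with the
 * netdev_for_each_lower_dev() iterator (the same helper that
 * dev_get_nest_level() below relies on). The function name is made up.
 */
static __maybe_unused unsigned int example_count_lower_devs(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;
	unsigned int n = 0;

	ASSERT_RTNL();			/* keep the lower list stable */

	netdev_for_each_lower_dev(dev, lower, iter)
		n++;

	return n;
}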
4847
4848/**
4849 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4850 *                                     lower neighbour list, RCU
4851 *                                     variant
4852 * @dev: device
4853 *
4854 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4855 * list. The caller must hold RCU read lock.
4856 */
4857void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4858{
4859        struct netdev_adjacent *lower;
4860
4861        lower = list_first_or_null_rcu(&dev->adj_list.lower,
4862                        struct netdev_adjacent, list);
4863        if (lower)
4864                return lower->private;
4865        return NULL;
4866}
4867EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4868
4869/**
4870 * netdev_master_upper_dev_get_rcu - Get master upper device
4871 * @dev: device
4872 *
4873 * Find a master upper device and return pointer to it or NULL in case
4874 * it's not there. The caller must hold the RCU read lock.
4875 */
4876struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4877{
4878        struct netdev_adjacent *upper;
4879
4880        upper = list_first_or_null_rcu(&dev->adj_list.upper,
4881                                       struct netdev_adjacent, list);
4882        if (upper && likely(upper->master))
4883                return upper->dev;
4884        return NULL;
4885}
4886EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4887
4888static int netdev_adjacent_sysfs_add(struct net_device *dev,
4889                              struct net_device *adj_dev,
4890                              struct list_head *dev_list)
4891{
4892        char linkname[IFNAMSIZ+7];
4893        sprintf(linkname, dev_list == &dev->adj_list.upper ?
4894                "upper_%s" : "lower_%s", adj_dev->name);
4895        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4896                                 linkname);
4897}
4898static void netdev_adjacent_sysfs_del(struct net_device *dev,
4899                               char *name,
4900                               struct list_head *dev_list)
4901{
4902        char linkname[IFNAMSIZ+7];
4903        sprintf(linkname, dev_list == &dev->adj_list.upper ?
4904                "upper_%s" : "lower_%s", name);
4905        sysfs_remove_link(&(dev->dev.kobj), linkname);
4906}
4907
4908static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4909                                                 struct net_device *adj_dev,
4910                                                 struct list_head *dev_list)
4911{
4912        return (dev_list == &dev->adj_list.upper ||
4913                dev_list == &dev->adj_list.lower) &&
4914                net_eq(dev_net(dev), dev_net(adj_dev));
4915}
4916
4917static int __netdev_adjacent_dev_insert(struct net_device *dev,
4918                                        struct net_device *adj_dev,
4919                                        u16 ref_nr,
4920                                        struct list_head *dev_list,
4921                                        void *private, bool master)
4922{
4923        struct netdev_adjacent *adj;
4924        int ret;
4925
4926        adj = __netdev_find_adj(dev, adj_dev, dev_list);
4927
4928        if (adj) {
4929                adj->ref_nr += ref_nr;
4930                return 0;
4931        }
4932
4933        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4934        if (!adj)
4935                return -ENOMEM;
4936
4937        adj->dev = adj_dev;
4938        adj->master = master;
4939        adj->ref_nr = ref_nr;
4940        adj->private = private;
4941        dev_hold(adj_dev);
4942
4943        pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4944                 adj_dev->name, dev->name, adj_dev->name);
4945
4946        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4947                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4948                if (ret)
4949                        goto free_adj;
4950        }
4951
4952        /* Ensure that master link is always the first item in list. */
4953        if (master) {
4954                ret = sysfs_create_link(&(dev->dev.kobj),
4955                                        &(adj_dev->dev.kobj), "master");
4956                if (ret)
4957                        goto remove_symlinks;
4958
4959                list_add_rcu(&adj->list, dev_list);
4960        } else {
4961                list_add_tail_rcu(&adj->list, dev_list);
4962        }
4963
4964        return 0;
4965
4966remove_symlinks:
4967        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4968                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4969free_adj:
4970        kfree(adj);
4971        dev_put(adj_dev);
4972
4973        return ret;
4974}
4975
4976static void __netdev_adjacent_dev_remove(struct net_device *dev,
4977                                         struct net_device *adj_dev,
4978                                         u16 ref_nr,
4979                                         struct list_head *dev_list)
4980{
4981        struct netdev_adjacent *adj;
4982
4983        adj = __netdev_find_adj(dev, adj_dev, dev_list);
4984
4985        if (!adj) {
4986                pr_err("tried to remove device %s from %s\n",
4987                       dev->name, adj_dev->name);
4988                BUG();
4989        }
4990
4991        if (adj->ref_nr > ref_nr) {
4992                pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
4993                         ref_nr, adj->ref_nr-ref_nr);
4994                adj->ref_nr -= ref_nr;
4995                return;
4996        }
4997
4998        if (adj->master)
4999                sysfs_remove_link(&(dev->dev.kobj), "master");
5000
5001        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5002                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5003
5004        list_del_rcu(&adj->list);
5005        pr_debug("dev_put for %s, because link removed from %s to %s\n",
5006                 adj_dev->name, dev->name, adj_dev->name);
5007        dev_put(adj_dev);
5008        kfree_rcu(adj, rcu);
5009}
5010
5011static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5012                                            struct net_device *upper_dev,
5013                                            u16 ref_nr,
5014                                            struct list_head *up_list,
5015                                            struct list_head *down_list,
5016                                            void *private, bool master)
5017{
5018        int ret;
5019
5020        ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5021                                           private, master);
5022        if (ret)
5023                return ret;
5024
5025        ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5026                                           private, false);
5027        if (ret) {
5028                __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5029                return ret;
5030        }
5031
5032        return 0;
5033}
5034
5035static int __netdev_adjacent_dev_link(struct net_device *dev,
5036                                      struct net_device *upper_dev,
5037                                      u16 ref_nr)
5038{
5039        return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5040                                                &dev->all_adj_list.upper,
5041                                                &upper_dev->all_adj_list.lower,
5042                                                NULL, false);
5043}
5044
5045static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5046                                               struct net_device *upper_dev,
5047                                               u16 ref_nr,
5048                                               struct list_head *up_list,
5049                                               struct list_head *down_list)
5050{
5051        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5052        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5053}
5054
5055static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5056                                         struct net_device *upper_dev,
5057                                         u16 ref_nr)
5058{
5059        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5060                                           &dev->all_adj_list.upper,
5061                                           &upper_dev->all_adj_list.lower);
5062}
5063
5064static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5065                                                struct net_device *upper_dev,
5066                                                void *private, bool master)
5067{
5068        int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5069
5070        if (ret)
5071                return ret;
5072
5073        ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5074                                               &dev->adj_list.upper,
5075                                               &upper_dev->adj_list.lower,
5076                                               private, master);
5077        if (ret) {
5078                __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5079                return ret;
5080        }
5081
5082        return 0;
5083}
5084
5085static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5086                                                   struct net_device *upper_dev)
5087{
5088        __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5089        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5090                                           &dev->adj_list.upper,
5091                                           &upper_dev->adj_list.lower);
5092}
5093
5094static int __netdev_upper_dev_link(struct net_device *dev,
5095                                   struct net_device *upper_dev, bool master,
5096                                   void *private)
5097{
5098        struct netdev_adjacent *i, *j, *to_i, *to_j;
5099        int ret = 0;
5100
5101        ASSERT_RTNL();
5102
5103        if (dev == upper_dev)
5104                return -EBUSY;
5105
5106        /* To prevent loops, check that dev is not an upper device of upper_dev. */
5107        if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5108                return -EBUSY;
5109
5110        if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5111                return -EEXIST;
5112
5113        if (master && netdev_master_upper_dev_get(dev))
5114                return -EBUSY;
5115
5116        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5117                                                   master);
5118        if (ret)
5119                return ret;
5120
5121        /* Now that we linked these devs, make all the upper_dev's
5122         * all_adj_list.upper visible to every dev's all_adj_list.lower and
5123         * vice versa, and don't forget the devices themselves. All of these
5124         * links are non-neighbours.
5125         */
5126        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5127                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5128                        pr_debug("Interlinking %s with %s, non-neighbour\n",
5129                                 i->dev->name, j->dev->name);
5130                        ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5131                        if (ret)
5132                                goto rollback_mesh;
5133                }
5134        }
5135
5136        /* add dev to every upper_dev's upper device */
5137        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5138                pr_debug("linking %s's upper device %s with %s\n",
5139                         upper_dev->name, i->dev->name, dev->name);
5140                ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5141                if (ret)
5142                        goto rollback_upper_mesh;
5143        }
5144
5145        /* add upper_dev to every dev's lower device */
5146        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5147                pr_debug("linking %s's lower device %s with %s\n", dev->name,
5148                         i->dev->name, upper_dev->name);
5149                ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5150                if (ret)
5151                        goto rollback_lower_mesh;
5152        }
5153
5154        call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5155        return 0;
5156
5157rollback_lower_mesh:
5158        to_i = i;
5159        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5160                if (i == to_i)
5161                        break;
5162                __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5163        }
5164
5165        i = NULL;
5166
5167rollback_upper_mesh:
5168        to_i = i;
5169        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5170                if (i == to_i)
5171                        break;
5172                __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5173        }
5174
5175        i = j = NULL;
5176
5177rollback_mesh:
5178        to_i = i;
5179        to_j = j;
5180        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5181                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5182                        if (i == to_i && j == to_j)
5183                                break;
5184                        __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5185                }
5186                if (i == to_i)
5187                        break;
5188        }
5189
5190        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5191
5192        return ret;
5193}
5194
5195/**
5196 * netdev_upper_dev_link - Add a link to the upper device
5197 * @dev: device
5198 * @upper_dev: new upper device
5199 *
5200 * Adds a link to device which is upper to this one. The caller must hold
5201 * the RTNL lock. On a failure a negative errno code is returned.
5202 * On success the reference counts are adjusted and the function
5203 * returns zero.
5204 */
5205int netdev_upper_dev_link(struct net_device *dev,
5206                          struct net_device *upper_dev)
5207{
5208        return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5209}
5210EXPORT_SYMBOL(netdev_upper_dev_link);
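/* Illustrative sketch: a hypothetical driver linking a port device under its
 * own upper device. Only netdev_upper_dev_link()/netdev_upper_dev_unlink()
 * are real APIs here; the helper name and the setup step are placeholders.
 */
static __maybe_unused int example_add_port(struct net_device *upper,
					   struct net_device *port)
{
	int err;

	ASSERT_RTNL();

	err = netdev_upper_dev_link(port, upper);
	if (err)
		return err;

	/* ... driver-specific setup would go here; if it failed, the driver
	 * would call netdev_upper_dev_unlink(port, upper) to roll back ...
	 */
	return 0;
}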
5211
5212/**
5213 * netdev_master_upper_dev_link - Add a master link to the upper device
5214 * @dev: device
5215 * @upper_dev: new upper device
5216 *
5217 * Adds a link to device which is upper to this one. In this case, only
5218 * one master upper device can be linked, although other non-master devices
5219 * might be linked as well. The caller must hold the RTNL lock.
5220 * On a failure a negative errno code is returned. On success the reference
5221 * counts are adjusted and the function returns zero.
5222 */
5223int netdev_master_upper_dev_link(struct net_device *dev,
5224                                 struct net_device *upper_dev)
5225{
5226        return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5227}
5228EXPORT_SYMBOL(netdev_master_upper_dev_link);
5229
5230int netdev_master_upper_dev_link_private(struct net_device *dev,
5231                                         struct net_device *upper_dev,
5232                                         void *private)
5233{
5234        return __netdev_upper_dev_link(dev, upper_dev, true, private);
5235}
5236EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5237
5238/**
5239 * netdev_upper_dev_unlink - Removes a link to upper device
5240 * @dev: device
5241 * @upper_dev: upper device to remove
5242 *
5243 * Removes a link to device which is upper to this one. The caller must hold
5244 * the RTNL lock.
5245 */
5246void netdev_upper_dev_unlink(struct net_device *dev,
5247                             struct net_device *upper_dev)
5248{
5249        struct netdev_adjacent *i, *j;
5250        ASSERT_RTNL();
5251
5252        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5253
5254        /* Here is the tricky part. We must remove all dev's lower
5255         * devices from all upper_dev's upper devices and vice
5256         * versa, to maintain the graph relationship.
5257         */
5258        list_for_each_entry(i, &dev->all_adj_list.lower, list)
5259                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5260                        __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5261
5262        /* also remove the devices themselves from the lower/upper
5263         * device lists
5264         */
5265        list_for_each_entry(i, &dev->all_adj_list.lower, list)
5266                __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5267
5268        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5269                __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5270
5271        call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5272}
5273EXPORT_SYMBOL(netdev_upper_dev_unlink);
5274
5275void netdev_adjacent_add_links(struct net_device *dev)
5276{
5277        struct netdev_adjacent *iter;
5278
5279        struct net *net = dev_net(dev);
5280
5281        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5282                if (!net_eq(net,dev_net(iter->dev)))
5283                        continue;
5284                netdev_adjacent_sysfs_add(iter->dev, dev,
5285                                          &iter->dev->adj_list.lower);
5286                netdev_adjacent_sysfs_add(dev, iter->dev,
5287                                          &dev->adj_list.upper);
5288        }
5289
5290        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5291                if (!net_eq(net,dev_net(iter->dev)))
5292                        continue;
5293                netdev_adjacent_sysfs_add(iter->dev, dev,
5294                                          &iter->dev->adj_list.upper);
5295                netdev_adjacent_sysfs_add(dev, iter->dev,
5296                                          &dev->adj_list.lower);
5297        }
5298}
5299
5300void netdev_adjacent_del_links(struct net_device *dev)
5301{
5302        struct netdev_adjacent *iter;
5303
5304        struct net *net = dev_net(dev);
5305
5306        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5307                if (!net_eq(net,dev_net(iter->dev)))
5308                        continue;
5309                netdev_adjacent_sysfs_del(iter->dev, dev->name,
5310                                          &iter->dev->adj_list.lower);
5311                netdev_adjacent_sysfs_del(dev, iter->dev->name,
5312                                          &dev->adj_list.upper);
5313        }
5314
5315        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5316                if (!net_eq(net,dev_net(iter->dev)))
5317                        continue;
5318                netdev_adjacent_sysfs_del(iter->dev, dev->name,
5319                                          &iter->dev->adj_list.upper);
5320                netdev_adjacent_sysfs_del(dev, iter->dev->name,
5321                                          &dev->adj_list.lower);
5322        }
5323}
5324
5325void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5326{
5327        struct netdev_adjacent *iter;
5328
5329        struct net *net = dev_net(dev);
5330
5331        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5332                if (!net_eq(net,dev_net(iter->dev)))
5333                        continue;
5334                netdev_adjacent_sysfs_del(iter->dev, oldname,
5335                                          &iter->dev->adj_list.lower);
5336                netdev_adjacent_sysfs_add(iter->dev, dev,
5337                                          &iter->dev->adj_list.lower);
5338        }
5339
5340        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5341                if (!net_eq(net,dev_net(iter->dev)))
5342                        continue;
5343                netdev_adjacent_sysfs_del(iter->dev, oldname,
5344                                          &iter->dev->adj_list.upper);
5345                netdev_adjacent_sysfs_add(iter->dev, dev,
5346                                          &iter->dev->adj_list.upper);
5347        }
5348}
5349
5350void *netdev_lower_dev_get_private(struct net_device *dev,
5351                                   struct net_device *lower_dev)
5352{
5353        struct netdev_adjacent *lower;
5354
5355        if (!lower_dev)
5356                return NULL;
5357        lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5358        if (!lower)
5359                return NULL;
5360
5361        return lower->private;
5362}
5363EXPORT_SYMBOL(netdev_lower_dev_get_private);
5364
5365
5366int dev_get_nest_level(struct net_device *dev,
5367                       bool (*type_check)(struct net_device *dev))
5368{
5369        struct net_device *lower = NULL;
5370        struct list_head *iter;
5371        int max_nest = -1;
5372        int nest;
5373
5374        ASSERT_RTNL();
5375
5376        netdev_for_each_lower_dev(dev, lower, iter) {
5377                nest = dev_get_nest_level(lower, type_check);
5378                if (max_nest < nest)
5379                        max_nest = nest;
5380        }
5381
5382        if (type_check(dev))
5383                max_nest++;
5384
5385        return max_nest;
5386}
5387EXPORT_SYMBOL(dev_get_nest_level);
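/* Illustrative sketch: dev_get_nest_level() with a type_check callback. The
 * predicate and caller below are hypothetical; IFF_802_1Q_VLAN is the real
 * priv_flag the 802.1q code uses to mark VLAN devices.
 */
static bool example_is_vlan_dev(struct net_device *dev)
{
	return !!(dev->priv_flags & IFF_802_1Q_VLAN);
}

static __maybe_unused int example_vlan_nest_depth(struct net_device *dev)
{
	ASSERT_RTNL();			/* dev_get_nest_level() requires RTNL */

	return dev_get_nest_level(dev, example_is_vlan_dev);
}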
5388
5389static void dev_change_rx_flags(struct net_device *dev, int flags)
5390{
5391        const struct net_device_ops *ops = dev->netdev_ops;
5392
5393        if (ops->ndo_change_rx_flags)
5394                ops->ndo_change_rx_flags(dev, flags);
5395}
5396
5397static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5398{
5399        unsigned int old_flags = dev->flags;
5400        kuid_t uid;
5401        kgid_t gid;
5402
5403        ASSERT_RTNL();
5404
5405        dev->flags |= IFF_PROMISC;
5406        dev->promiscuity += inc;
5407        if (dev->promiscuity == 0) {
5408                /*
5409                 * Avoid overflow.
5410                 * If inc causes overflow, untouch promisc and return error.
5411                 */
5412                if (inc < 0)
5413                        dev->flags &= ~IFF_PROMISC;
5414                else {
5415                        dev->promiscuity -= inc;
5416                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5417                                dev->name);
5418                        return -EOVERFLOW;
5419                }
5420        }
5421        if (dev->flags != old_flags) {
5422                pr_info("device %s %s promiscuous mode\n",
5423                        dev->name,
5424                        dev->flags & IFF_PROMISC ? "entered" : "left");
5425                if (audit_enabled) {
5426                        current_uid_gid(&uid, &gid);
5427                        audit_log(current->audit_context, GFP_ATOMIC,
5428                                AUDIT_ANOM_PROMISCUOUS,
5429                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5430                                dev->name, (dev->flags & IFF_PROMISC),
5431                                (old_flags & IFF_PROMISC),
5432                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
5433                                from_kuid(&init_user_ns, uid),
5434                                from_kgid(&init_user_ns, gid),
5435                                audit_get_sessionid(current));
5436                }
5437
5438                dev_change_rx_flags(dev, IFF_PROMISC);
5439        }
5440        if (notify)
5441                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5442        return 0;
5443}
5444
5445/**
5446 *      dev_set_promiscuity     - update promiscuity count on a device
5447 *      @dev: device
5448 *      @inc: modifier
5449 *
5450 *      Add or remove promiscuity from a device. While the count in the device
5451 *      remains above zero the interface remains promiscuous. Once it hits zero
5452 *      the device reverts back to normal filtering operation. A negative inc
5453 *      value is used to drop promiscuity on the device.
5454 *      Return 0 if successful or a negative errno code on error.
5455 */
5456int dev_set_promiscuity(struct net_device *dev, int inc)
5457{
5458        unsigned int old_flags = dev->flags;
5459        int err;
5460
5461        err = __dev_set_promiscuity(dev, inc, true);
5462        if (err < 0)
5463                return err;
5464        if (dev->flags != old_flags)
5465                dev_set_rx_mode(dev);
5466        return err;
5467}
5468EXPORT_SYMBOL(dev_set_promiscuity);
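/* Illustrative sketch: how a packet-capture style user might take and release
 * a promiscuity reference. The wrapper names are hypothetical; every +1 passed
 * to dev_set_promiscuity() must eventually be balanced by a -1.
 */
static __maybe_unused int example_promisc_get(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();

	return err;
}

static __maybe_unused void example_promisc_put(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}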
5469
5470static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5471{
5472        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5473
5474        ASSERT_RTNL();
5475
5476        dev->flags |= IFF_ALLMULTI;
5477        dev->allmulti += inc;
5478        if (dev->allmulti == 0) {
5479                /*
5480                 * Avoid overflow.
5481                 * If inc causes overflow, untouch allmulti and return error.
5482                 */
5483                if (inc < 0)
5484                        dev->flags &= ~IFF_ALLMULTI;
5485                else {
5486                        dev->allmulti -= inc;
5487                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5488                                dev->name);
5489                        return -EOVERFLOW;
5490                }
5491        }
5492        if (dev->flags ^ old_flags) {
5493                dev_change_rx_flags(dev, IFF_ALLMULTI);
5494                dev_set_rx_mode(dev);
5495                if (notify)
5496                        __dev_notify_flags(dev, old_flags,
5497                                           dev->gflags ^ old_gflags);
5498        }
5499        return 0;
5500}
5501
5502/**
5503 *      dev_set_allmulti        - update allmulti count on a device
5504 *      @dev: device
5505 *      @inc: modifier
5506 *
5507 *      Add or remove reception of all multicast frames to a device. While the
5508 *      count in the device remains above zero the interface remains listening
5509 *      to all multicast frames. Once it hits zero the device reverts back to normal
5510 *      filtering operation. A negative @inc value is used to drop the counter
5511 *      when releasing a resource needing all multicasts.
5512 *      Return 0 if successful or a negative errno code on error.
5513 */
5514
5515int dev_set_allmulti(struct net_device *dev, int inc)
5516{
5517        return __dev_set_allmulti(dev, inc, true);
5518}
5519EXPORT_SYMBOL(dev_set_allmulti);
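/* Illustrative sketch: bumping the allmulti counter while a multicast routing
 * feature is active and dropping it again on teardown. The wrapper is
 * hypothetical; dev_set_allmulti() is the real API.
 */
static __maybe_unused int example_set_allmulti(struct net_device *dev, bool on)
{
	int err;

	rtnl_lock();
	err = dev_set_allmulti(dev, on ? 1 : -1);
	rtnl_unlock();

	return err;
}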
5520
5521/*
5522 *      Upload unicast and multicast address lists to device and
5523 *      configure RX filtering. When the device doesn't support unicast
5524 *      filtering it is put in promiscuous mode while unicast addresses
5525 *      are present.
5526 */
5527void __dev_set_rx_mode(struct net_device *dev)
5528{
5529        const struct net_device_ops *ops = dev->netdev_ops;
5530
5531        /* dev_open will call this function so the list will stay sane. */
5532        if (!(dev->flags&IFF_UP))
5533                return;
5534
5535        if (!netif_device_present(dev))
5536                return;
5537
5538        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5539                /* Unicast address changes may only happen under the rtnl,
5540                 * therefore calling __dev_set_promiscuity here is safe.
5541                 */
5542                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5543                        __dev_set_promiscuity(dev, 1, false);
5544                        dev->uc_promisc = true;
5545                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5546                        __dev_set_promiscuity(dev, -1, false);
5547                        dev->uc_promisc = false;
5548                }
5549        }
5550
5551        if (ops->ndo_set_rx_mode)
5552                ops->ndo_set_rx_mode(dev);
5553}
5554
5555void dev_set_rx_mode(struct net_device *dev)
5556{
5557        netif_addr_lock_bh(dev);
5558        __dev_set_rx_mode(dev);
5559        netif_addr_unlock_bh(dev);
5560}
5561
5562/**
5563 *      dev_get_flags - get flags reported to userspace
5564 *      @dev: device
5565 *
5566 *      Get the combination of flag bits exported through APIs to userspace.
5567 */
5568unsigned int dev_get_flags(const struct net_device *dev)
5569{
5570        unsigned int flags;
5571
5572        flags = (dev->flags & ~(IFF_PROMISC |
5573                                IFF_ALLMULTI |
5574                                IFF_RUNNING |
5575                                IFF_LOWER_UP |
5576                                IFF_DORMANT)) |
5577                (dev->gflags & (IFF_PROMISC |
5578                                IFF_ALLMULTI));
5579
5580        if (netif_running(dev)) {
5581                if (netif_oper_up(dev))
5582                        flags |= IFF_RUNNING;
5583                if (netif_carrier_ok(dev))
5584                        flags |= IFF_LOWER_UP;
5585                if (netif_dormant(dev))
5586                        flags |= IFF_DORMANT;
5587        }
5588
5589        return flags;
5590}
5591EXPORT_SYMBOL(dev_get_flags);
5592
5593int __dev_change_flags(struct net_device *dev, unsigned int flags)
5594{
5595        unsigned int old_flags = dev->flags;
5596        int ret;
5597
5598        ASSERT_RTNL();
5599
5600        /*
5601         *      Set the flags on our device.
5602         */
5603
5604        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5605                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5606                               IFF_AUTOMEDIA)) |
5607                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5608                                    IFF_ALLMULTI));
5609
5610        /*
5611         *      Load in the correct multicast list now the flags have changed.
5612         */
5613
5614        if ((old_flags ^ flags) & IFF_MULTICAST)
5615                dev_change_rx_flags(dev, IFF_MULTICAST);
5616
5617        dev_set_rx_mode(dev);
5618
5619        /*
5620         *      Have we downed the interface? We handle IFF_UP ourselves
5621         *      according to user attempts to set it, rather than blindly
5622         *      setting it.
5623         */
5624
5625        ret = 0;
5626        if ((old_flags ^ flags) & IFF_UP)
5627                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5628
5629        if ((flags ^ dev->gflags) & IFF_PROMISC) {
5630                int inc = (flags & IFF_PROMISC) ? 1 : -1;
5631                unsigned int old_flags = dev->flags;
5632
5633                dev->gflags ^= IFF_PROMISC;
5634
5635                if (__dev_set_promiscuity(dev, inc, false) >= 0)
5636                        if (dev->flags != old_flags)
5637                                dev_set_rx_mode(dev);
5638        }
5639
5640        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5641           is important. Some (broken) drivers set IFF_PROMISC when
5642           IFF_ALLMULTI is requested, without asking us and without reporting it.
5643         */
5644        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5645                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5646
5647                dev->gflags ^= IFF_ALLMULTI;
5648                __dev_set_allmulti(dev, inc, false);
5649        }
5650
5651        return ret;
5652}
5653
5654void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5655                        unsigned int gchanges)
5656{
5657        unsigned int changes = dev->flags ^ old_flags;
5658
5659        if (gchanges)
5660                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5661
5662        if (changes & IFF_UP) {
5663                if (dev->flags & IFF_UP)
5664                        call_netdevice_notifiers(NETDEV_UP, dev);
5665                else
5666                        call_netdevice_notifiers(NETDEV_DOWN, dev);
5667        }
5668
5669        if (dev->flags & IFF_UP &&
5670            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5671                struct netdev_notifier_change_info change_info;
5672
5673                change_info.flags_changed = changes;
5674                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5675                                              &change_info.info);
5676        }
5677}
5678
5679/**
5680 *      dev_change_flags - change device settings
5681 *      @dev: device
5682 *      @flags: device state flags
5683 *
5684 *      Change settings on device based state flags. The flags are
5685 *      in the userspace exported format.
5686 */
5687int dev_change_flags(struct net_device *dev, unsigned int flags)
5688{
5689        int ret;
5690        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5691
5692        ret = __dev_change_flags(dev, flags);
5693        if (ret < 0)
5694                return ret;
5695
5696        changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5697        __dev_notify_flags(dev, old_flags, changes);
5698        return ret;
5699}
5700EXPORT_SYMBOL(dev_change_flags);
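/* Illustrative sketch: administratively bringing an interface up via the
 * userspace-visible flags, as an ioctl/netlink handler would. The helper name
 * is hypothetical.
 */
static __maybe_unused int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();

	return err;
}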
5701
5702static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5703{
5704        const struct net_device_ops *ops = dev->netdev_ops;
5705
5706        if (ops->ndo_change_mtu)
5707                return ops->ndo_change_mtu(dev, new_mtu);
5708
5709        dev->mtu = new_mtu;
5710        return 0;
5711}
5712
5713/**
5714 *      dev_set_mtu - Change maximum transfer unit
5715 *      @dev: device
5716 *      @new_mtu: new transfer unit
5717 *
5718 *      Change the maximum transfer size of the network device.
5719 */
5720int dev_set_mtu(struct net_device *dev, int new_mtu)
5721{
5722        int err, orig_mtu;
5723
5724        if (new_mtu == dev->mtu)
5725                return 0;
5726
5727        /*      MTU must be positive.    */
5728        if (new_mtu < 0)
5729                return -EINVAL;
5730
5731        if (!netif_device_present(dev))
5732                return -ENODEV;
5733
5734        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5735        err = notifier_to_errno(err);
5736        if (err)
5737                return err;
5738
5739        orig_mtu = dev->mtu;
5740        err = __dev_set_mtu(dev, new_mtu);
5741
5742        if (!err) {
5743                err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5744                err = notifier_to_errno(err);
5745                if (err) {
5746                        /* setting mtu back and notifying everyone again,
5747                         * so that they have a chance to revert changes.
5748                         */
5749                        __dev_set_mtu(dev, orig_mtu);
5750                        call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5751                }
5752        }
5753        return err;
5754}
5755EXPORT_SYMBOL(dev_set_mtu);
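/* Illustrative sketch: changing the MTU from a context that does not yet hold
 * RTNL. 9000 is just an example jumbo-frame value; whether the driver accepts
 * it is up to its ndo_change_mtu().
 */
static __maybe_unused int example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();

	return err;
}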
5756
5757/**
5758 *      dev_set_group - Change group this device belongs to
5759 *      @dev: device
5760 *      @new_group: group this device should belong to
5761 */
5762void dev_set_group(struct net_device *dev, int new_group)
5763{
5764        dev->group = new_group;
5765}
5766EXPORT_SYMBOL(dev_set_group);
5767
5768/**
5769 *      dev_set_mac_address - Change Media Access Control Address
5770 *      @dev: device
5771 *      @sa: new address
5772 *
5773 *      Change the hardware (MAC) address of the device
5774 */
5775int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5776{
5777        const struct net_device_ops *ops = dev->netdev_ops;
5778        int err;
5779
5780        if (!ops->ndo_set_mac_address)
5781                return -EOPNOTSUPP;
5782        if (sa->sa_family != dev->type)
5783                return -EINVAL;
5784        if (!netif_device_present(dev))
5785                return -ENODEV;
5786        err = ops->ndo_set_mac_address(dev, sa);
5787        if (err)
5788                return err;
5789        dev->addr_assign_type = NET_ADDR_SET;
5790        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5791        add_device_randomness(dev->dev_addr, dev->addr_len);
5792        return 0;
5793}
5794EXPORT_SYMBOL(dev_set_mac_address);
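/* Illustrative sketch: building the struct sockaddr that dev_set_mac_address()
 * expects. sa_family must match dev->type and the hardware address is carried
 * in sa_data. The helper is hypothetical.
 */
static __maybe_unused int example_set_hw_addr(struct net_device *dev,
					      const unsigned char *addr)
{
	struct sockaddr sa;
	int err;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();

	return err;
}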
5795
5796/**
5797 *      dev_change_carrier - Change device carrier
5798 *      @dev: device
5799 *      @new_carrier: new value
5800 *
5801 *      Change device carrier
5802 */
5803int dev_change_carrier(struct net_device *dev, bool new_carrier)
5804{
5805        const struct net_device_ops *ops = dev->netdev_ops;
5806
5807        if (!ops->ndo_change_carrier)
5808                return -EOPNOTSUPP;
5809        if (!netif_device_present(dev))
5810                return -ENODEV;
5811        return ops->ndo_change_carrier(dev, new_carrier);
5812}
5813EXPORT_SYMBOL(dev_change_carrier);
5814
5815/**
5816 *      dev_get_phys_port_id - Get device physical port ID
5817 *      @dev: device
5818 *      @ppid: port ID
5819 *
5820 *      Get device physical port ID
5821 */
5822int dev_get_phys_port_id(struct net_device *dev,
5823                         struct netdev_phys_port_id *ppid)
5824{
5825        const struct net_device_ops *ops = dev->netdev_ops;
5826
5827        if (!ops->ndo_get_phys_port_id)
5828                return -EOPNOTSUPP;
5829        return ops->ndo_get_phys_port_id(dev, ppid);
5830}
5831EXPORT_SYMBOL(dev_get_phys_port_id);
5832
5833/**
5834 *      dev_new_index   -       allocate an ifindex
5835 *      @net: the applicable net namespace
5836 *
5837 *      Returns a suitable unique value for a new device interface
5838 *      number.  The caller must hold the rtnl semaphore or the
5839 *      dev_base_lock to be sure it remains unique.
5840 */
5841static int dev_new_index(struct net *net)
5842{
5843        int ifindex = net->ifindex;
5844        for (;;) {
5845                if (++ifindex <= 0)
5846                        ifindex = 1;
5847                if (!__dev_get_by_index(net, ifindex))
5848                        return net->ifindex = ifindex;
5849        }
5850}
5851
5852/* Delayed registration/unregisteration */
5853static LIST_HEAD(net_todo_list);
5854DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5855
5856static void net_set_todo(struct net_device *dev)
5857{
5858        list_add_tail(&dev->todo_list, &net_todo_list);
5859        dev_net(dev)->dev_unreg_count++;
5860}
5861
5862static void rollback_registered_many(struct list_head *head)
5863{
5864        struct net_device *dev, *tmp;
5865        LIST_HEAD(close_head);
5866
5867        BUG_ON(dev_boot_phase);
5868        ASSERT_RTNL();
5869
5870        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5871                /* Some devices call without registering
5872                 * for initialization unwind. Remove those
5873                 * devices and proceed with the remaining.
5874                 */
5875                if (dev->reg_state == NETREG_UNINITIALIZED) {
5876                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5877                                 dev->name, dev);
5878
5879                        WARN_ON(1);
5880                        list_del(&dev->unreg_list);
5881                        continue;
5882                }
5883                dev->dismantle = true;
5884                BUG_ON(dev->reg_state != NETREG_REGISTERED);
5885        }
5886
5887        /* If device is running, close it first. */
5888        list_for_each_entry(dev, head, unreg_list)
5889                list_add_tail(&dev->close_list, &close_head);
5890        dev_close_many(&close_head);
5891
5892        list_for_each_entry(dev, head, unreg_list) {
5893                /* And unlink it from device chain. */
5894                unlist_netdevice(dev);
5895
5896                dev->reg_state = NETREG_UNREGISTERING;
5897                on_each_cpu(flush_backlog, dev, 1);
5898        }
5899
5900        synchronize_net();
5901
5902        list_for_each_entry(dev, head, unreg_list) {
5903                /* Shutdown queueing discipline. */
5904                dev_shutdown(dev);
5905
5906
5907                /* Notify protocols that we are about to destroy
5908                   this device. They should clean all the things.
5909                */
5910                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5911
5912                /*
5913                 *      Flush the unicast and multicast chains
5914                 */
5915                dev_uc_flush(dev);
5916                dev_mc_flush(dev);
5917
5918                if (dev->netdev_ops->ndo_uninit)
5919                        dev->netdev_ops->ndo_uninit(dev);
5920
5921                if (!dev->rtnl_link_ops ||
5922                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5923                        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5924
5925                /* Notifier chain MUST detach us from all upper devices. */
5926                WARN_ON(netdev_has_any_upper_dev(dev));
5927
5928                /* Remove entries from kobject tree */
5929                netdev_unregister_kobject(dev);
5930#ifdef CONFIG_XPS
5931                /* Remove XPS queueing entries */
5932                netif_reset_xps_queues_gt(dev, 0);
5933#endif
5934        }
5935
5936        synchronize_net();
5937
5938        list_for_each_entry(dev, head, unreg_list)
5939                dev_put(dev);
5940}
5941
5942static void rollback_registered(struct net_device *dev)
5943{
5944        LIST_HEAD(single);
5945
5946        list_add(&dev->unreg_list, &single);
5947        rollback_registered_many(&single);
5948        list_del(&single);
5949}
5950
5951static netdev_features_t netdev_fix_features(struct net_device *dev,
5952        netdev_features_t features)
5953{
5954        /* Fix illegal checksum combinations */
5955        if ((features & NETIF_F_HW_CSUM) &&
5956            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5957                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5958                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5959        }
5960
5961        /* TSO requires that SG is present as well. */
5962        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5963                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5964                features &= ~NETIF_F_ALL_TSO;
5965        }
5966
5967        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5968                                        !(features & NETIF_F_IP_CSUM)) {
5969                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5970                features &= ~NETIF_F_TSO;
5971                features &= ~NETIF_F_TSO_ECN;
5972        }
5973
5974        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5975                                         !(features & NETIF_F_IPV6_CSUM)) {
5976                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5977                features &= ~NETIF_F_TSO6;
5978        }
5979
5980        /* TSO ECN requires that TSO is present as well. */
5981        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5982                features &= ~NETIF_F_TSO_ECN;
5983
5984        /* Software GSO depends on SG. */
5985        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5986                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5987                features &= ~NETIF_F_GSO;
5988        }
5989
5990        /* UFO needs SG and checksumming */
5991        if (features & NETIF_F_UFO) {
5992                /* maybe split UFO into V4 and V6? */
5993                if (!((features & NETIF_F_GEN_CSUM) ||
5994                    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5995                            == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5996                        netdev_dbg(dev,
5997                                "Dropping NETIF_F_UFO since no checksum offload features.\n");
5998                        features &= ~NETIF_F_UFO;
5999                }
6000
6001                if (!(features & NETIF_F_SG)) {
6002                        netdev_dbg(dev,
6003                                "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6004                        features &= ~NETIF_F_UFO;
6005                }
6006        }
6007
6008#ifdef CONFIG_NET_RX_BUSY_POLL
6009        if (dev->netdev_ops->ndo_busy_poll)
6010                features |= NETIF_F_BUSY_POLL;
6011        else
6012#endif
6013                features &= ~NETIF_F_BUSY_POLL;
6014
6015        return features;
6016}
6017
6018int __netdev_update_features(struct net_device *dev)
6019{
6020        netdev_features_t features;
6021        int err = 0;
6022
6023        ASSERT_RTNL();
6024
6025        features = netdev_get_wanted_features(dev);
6026
6027        if (dev->netdev_ops->ndo_fix_features)
6028                features = dev->netdev_ops->ndo_fix_features(dev, features);
6029
6030        /* driver might be less strict about feature dependencies */
6031        features = netdev_fix_features(dev, features);
6032
6033        if (dev->features == features)
6034                return 0;
6035
6036        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6037                &dev->features, &features);
6038
6039        if (dev->netdev_ops->ndo_set_features)
6040                err = dev->netdev_ops->ndo_set_features(dev, features);
6041
6042        if (unlikely(err < 0)) {
6043                netdev_err(dev,
6044                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
6045                        err, &features, &dev->features);
6046                return -1;
6047        }
6048
6049        if (!err)
6050                dev->features = features;
6051
6052        return 1;
6053}
6054
6055/**
6056 *      netdev_update_features - recalculate device features
6057 *      @dev: the device to check
6058 *
6059 *      Recalculate dev->features set and send notifications if it
6060 *      has changed. Should be called after driver or hardware dependent
6061 *      conditions might have changed that influence the features.
6062 */
6063void netdev_update_features(struct net_device *dev)
6064{
6065        if (__netdev_update_features(dev))
6066                netdev_features_change(dev);
6067}
6068EXPORT_SYMBOL(netdev_update_features);
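/* Illustrative sketch: the usual driver pattern around
 * netdev_update_features(): adjust the advertised feature sets under RTNL and
 * let the core recompute dev->features and fire notifications. Disabling TSO
 * here is an arbitrary example policy.
 */
static __maybe_unused void example_forbid_tso(struct net_device *dev)
{
	ASSERT_RTNL();

	dev->hw_features &= ~NETIF_F_ALL_TSO;
	dev->wanted_features &= ~NETIF_F_ALL_TSO;
	netdev_update_features(dev);
}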
6069
6070/**
6071 *      netdev_change_features - recalculate device features
6072 *      @dev: the device to check
6073 *
6074 *      Recalculate dev->features set and send notifications even
6075 *      if they have not changed. Should be called instead of
6076 *      netdev_update_features() if also dev->vlan_features might
6077 *      have changed to allow the changes to be propagated to stacked
6078 *      VLAN devices.
6079 */
6080void netdev_change_features(struct net_device *dev)
6081{
6082        __netdev_update_features(dev);
6083        netdev_features_change(dev);
6084}
6085EXPORT_SYMBOL(netdev_change_features);
6086
6087/**
6088 *      netif_stacked_transfer_operstate -      transfer operstate
6089 *      @rootdev: the root or lower level device to transfer state from
6090 *      @dev: the device to transfer operstate to
6091 *
6092 *      Transfer operational state from root to device. This is normally
6093 *      called when a stacking relationship exists between the root
6094 *      device and the device (a leaf device).
6095 */
6096void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6097                                        struct net_device *dev)
6098{
6099        if (rootdev->operstate == IF_OPER_DORMANT)
6100                netif_dormant_on(dev);
6101        else
6102                netif_dormant_off(dev);
6103
6104        if (netif_carrier_ok(rootdev)) {
6105                if (!netif_carrier_ok(dev))
6106                        netif_carrier_on(dev);
6107        } else {
6108                if (netif_carrier_ok(dev))
6109                        netif_carrier_off(dev);
6110        }
6111}
6112EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6113
6114#ifdef CONFIG_SYSFS
6115static int netif_alloc_rx_queues(struct net_device *dev)
6116{
6117        unsigned int i, count = dev->num_rx_queues;
6118        struct netdev_rx_queue *rx;
6119
6120        BUG_ON(count < 1);
6121
6122        rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6123        if (!rx)
6124                return -ENOMEM;
6125
6126        dev->_rx = rx;
6127
6128        for (i = 0; i < count; i++)
6129                rx[i].dev = dev;
6130        return 0;
6131}
6132#endif
6133
6134static void netdev_init_one_queue(struct net_device *dev,
6135                                  struct netdev_queue *queue, void *_unused)
6136{
6137        /* Initialize queue lock */
6138        spin_lock_init(&queue->_xmit_lock);
6139        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6140        queue->xmit_lock_owner = -1;
6141        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6142        queue->dev = dev;
6143#ifdef CONFIG_BQL
6144        dql_init(&queue->dql, HZ);
6145#endif
6146}
6147
6148static void netif_free_tx_queues(struct net_device *dev)
6149{
6150        kvfree(dev->_tx);
6151}
6152
6153static int netif_alloc_netdev_queues(struct net_device *dev)
6154{
6155        unsigned int count = dev->num_tx_queues;
6156        struct netdev_queue *tx;
6157        size_t sz = count * sizeof(*tx);
6158
6159        if (count < 1 || count > 0xffff)
6160                return -EINVAL;
6161
6162        tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6163        if (!tx) {
6164                tx = vzalloc(sz);
6165                if (!tx)
6166                        return -ENOMEM;
6167        }
6168        dev->_tx = tx;
6169
6170        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6171        spin_lock_init(&dev->tx_global_lock);
6172
6173        return 0;
6174}
6175
6176/**
6177 *      register_netdevice      - register a network device
6178 *      @dev: device to register
6179 *
6180 *      Take a completed network device structure and add it to the kernel
6181 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6182 *      chain. 0 is returned on success. A negative errno code is returned
6183 *      on a failure to set up the device, or if the name is a duplicate.
6184 *
6185 *      Callers must hold the rtnl semaphore. You may want
6186 *      register_netdev() instead of this.
6187 *
6188 *      BUGS:
6189 *      The locking appears insufficient to guarantee two parallel registers
6190 *      will not get the same name.
6191 */
6192
6193int register_netdevice(struct net_device *dev)
6194{
6195        int ret;
6196        struct net *net = dev_net(dev);
6197
6198        BUG_ON(dev_boot_phase);
6199        ASSERT_RTNL();
6200
6201        might_sleep();
6202
6203        /* When net_devices are persistent, this will be fatal. */
6204        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6205        BUG_ON(!net);
6206
6207        spin_lock_init(&dev->addr_list_lock);
6208        netdev_set_addr_lockdep_class(dev);
6209
6210        dev->iflink = -1;
6211
6212        ret = dev_get_valid_name(net, dev, dev->name);
6213        if (ret < 0)
6214                goto out;
6215
6216        /* Init, if this function is available */
6217        if (dev->netdev_ops->ndo_init) {
6218                ret = dev->netdev_ops->ndo_init(dev);
6219                if (ret) {
6220                        if (ret > 0)
6221                                ret = -EIO;
6222                        goto out;
6223                }
6224        }
6225
6226        if (((dev->hw_features | dev->features) &
6227             NETIF_F_HW_VLAN_CTAG_FILTER) &&
6228            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6229             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6230                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6231                ret = -EINVAL;
6232                goto err_uninit;
6233        }
6234
6235        ret = -EBUSY;
6236        if (!dev->ifindex)
6237                dev->ifindex = dev_new_index(net);
6238        else if (__dev_get_by_index(net, dev->ifindex))
6239                goto err_uninit;
6240
6241        if (dev->iflink == -1)
6242                dev->iflink = dev->ifindex;
6243
6244        /* Transfer changeable features to wanted_features and enable
6245         * software offloads (GSO and GRO).
6246         */
6247        dev->hw_features |= NETIF_F_SOFT_FEATURES;
6248        dev->features |= NETIF_F_SOFT_FEATURES;
6249        dev->wanted_features = dev->features & dev->hw_features;
6250
6251        if (!(dev->flags & IFF_LOOPBACK)) {
6252                dev->hw_features |= NETIF_F_NOCACHE_COPY;
6253        }
6254
6255        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6256         */
6257        dev->vlan_features |= NETIF_F_HIGHDMA;
6258
6259        /* Make NETIF_F_SG inheritable to tunnel devices.
6260         */
6261        dev->hw_enc_features |= NETIF_F_SG;
6262
6263        /* Make NETIF_F_SG inheritable to MPLS.
6264         */
6265        dev->mpls_features |= NETIF_F_SG;
6266
6267        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6268        ret = notifier_to_errno(ret);
6269        if (ret)
6270                goto err_uninit;
6271
6272        ret = netdev_register_kobject(dev);
6273        if (ret)
6274                goto err_uninit;
6275        dev->reg_state = NETREG_REGISTERED;
6276
6277        __netdev_update_features(dev);
6278
6279        /*
6280         *      Default initial state at registration is that the
6281         *      device is present.
6282         */
6283
6284        set_bit(__LINK_STATE_PRESENT, &dev->state);
6285
6286        linkwatch_init_dev(dev);
6287
6288        dev_init_scheduler(dev);
6289        dev_hold(dev);
6290        list_netdevice(dev);
6291        add_device_randomness(dev->dev_addr, dev->addr_len);
6292
6293        /* If the device has a permanent device address, the driver should
6294         * set dev_addr and also addr_assign_type should be set to
6295         * NET_ADDR_PERM (default value).
6296         */
6297        if (dev->addr_assign_type == NET_ADDR_PERM)
6298                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6299
6300        /* Notify protocols that a new device appeared. */
6301        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6302        ret = notifier_to_errno(ret);
6303        if (ret) {
6304                rollback_registered(dev);
6305                dev->reg_state = NETREG_UNREGISTERED;
6306        }
6307        /*
6308         *      Prevent userspace races by waiting until the network
6309         *      device is fully set up before sending notifications.
6310         */
6311        if (!dev->rtnl_link_ops ||
6312            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6313                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6314
6315out:
6316        return ret;
6317
6318err_uninit:
6319        if (dev->netdev_ops->ndo_uninit)
6320                dev->netdev_ops->ndo_uninit(dev);
6321        goto out;
6322}
6323EXPORT_SYMBOL(register_netdevice);
6324
6325/**
6326 *      init_dummy_netdev       - init a dummy network device for NAPI
6327 *      @dev: device to init
6328 *
6329 *      This takes a network device structure and initialize the minimum
6330 *      amount of fields so it can be used to schedule NAPI polls without
6331 *      registering a full blown interface. This is to be used by drivers
6332 *      that need to tie several hardware interfaces to a single NAPI
6333 *      poll scheduler due to HW limitations.
6334 */
6335int init_dummy_netdev(struct net_device *dev)
6336{
6337        /* Clear everything. Note we don't initialize spinlocks
6338         * as they aren't supposed to be taken by any of the
6339         * NAPI code and this dummy netdev is supposed to be
6340         * only ever used for NAPI polls
6341         */
6342        memset(dev, 0, sizeof(struct net_device));
6343
6344        /* make sure we BUG if trying to hit standard
6345         * register/unregister code path
6346         */
6347        dev->reg_state = NETREG_DUMMY;
6348
6349        /* NAPI wants this */
6350        INIT_LIST_HEAD(&dev->napi_list);
6351
6352        /* a dummy interface is started by default */
6353        set_bit(__LINK_STATE_PRESENT, &dev->state);
6354        set_bit(__LINK_STATE_START, &dev->state);
6355
6356        /* Note : We don't allocate pcpu_refcnt for dummy devices,
6357         * because users of this 'device' don't need to change
6358         * its refcount.
6359         */
6360
6361        return 0;
6362}
6363EXPORT_SYMBOL_GPL(init_dummy_netdev);
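/*
 * Example (illustrative sketch, not part of the original file): a driver
 * whose hardware funnels several interfaces through one interrupt can back
 * them with a single dummy netdev used only for NAPI.  "struct foo_hw",
 * foo_poll() and FOO_NAPI_WEIGHT are hypothetical names:
 *
 *	struct foo_hw {
 *		struct net_device napi_dev;	// dummy, never registered
 *		struct napi_struct napi;
 *	};
 *
 *	static void foo_init_napi(struct foo_hw *hw)
 *	{
 *		init_dummy_netdev(&hw->napi_dev);
 *		netif_napi_add(&hw->napi_dev, &hw->napi, foo_poll,
 *			       FOO_NAPI_WEIGHT);
 *		napi_enable(&hw->napi);
 *	}
 *
 * The interrupt handler then simply calls napi_schedule(&hw->napi).
 */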
6364
6365
6366/**
6367 *      register_netdev - register a network device
6368 *      @dev: device to register
6369 *
6370 *      Take a completed network device structure and add it to the kernel
6371 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6372 *      chain. 0 is returned on success. A negative errno code is returned
6373 *      on a failure to set up the device, or if the name is a duplicate.
6374 *
6375 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6376 *      and expands the device name if you passed a format string to
6377 *      alloc_netdev.
6378 */
6379int register_netdev(struct net_device *dev)
6380{
6381        int err;
6382
6383        rtnl_lock();
6384        err = register_netdevice(dev);
6385        rtnl_unlock();
6386        return err;
6387}
6388EXPORT_SYMBOL(register_netdev);
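/*
 * Example (illustrative sketch, not part of the original file): the usual
 * probe-time pattern around register_netdev().  foo_probe(), struct
 * foo_priv and foo_netdev_ops are hypothetical driver names:
 *
 *	static int foo_probe(struct device *parent)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct foo_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *		dev->netdev_ops = &foo_netdev_ops;
 *		SET_NETDEV_DEV(dev, parent);
 *
 *		err = register_netdev(dev);	// takes and releases the RTNL
 *		if (err)
 *			free_netdev(dev);
 *		return err;
 *	}
 */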
6389
6390int netdev_refcnt_read(const struct net_device *dev)
6391{
6392        int i, refcnt = 0;
6393
6394        for_each_possible_cpu(i)
6395                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6396        return refcnt;
6397}
6398EXPORT_SYMBOL(netdev_refcnt_read);
6399
6400/**
6401 * netdev_wait_allrefs - wait until all references are gone.
6402 * @dev: target net_device
6403 *
6404 * This is called when unregistering network devices.
6405 *
6406 * Any protocol or device that holds a reference should register
6407 * for netdevice notification, and cleanup and put back the
6408 * reference if they receive an UNREGISTER event.
6409 * We can get stuck here if buggy protocols don't correctly
6410 * call dev_put.
6411 */
6412static void netdev_wait_allrefs(struct net_device *dev)
6413{
6414        unsigned long rebroadcast_time, warning_time;
6415        int refcnt;
6416
6417        linkwatch_forget_dev(dev);
6418
6419        rebroadcast_time = warning_time = jiffies;
6420        refcnt = netdev_refcnt_read(dev);
6421
6422        while (refcnt != 0) {
6423                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6424                        rtnl_lock();
6425
6426                        /* Rebroadcast unregister notification */
6427                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6428
6429                        __rtnl_unlock();
6430                        rcu_barrier();
6431                        rtnl_lock();
6432
6433                        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6434                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6435                                     &dev->state)) {
6436                                /* We must not have linkwatch events
6437                                 * pending on unregister. If this
6438                                 * happens, we simply run the queue
6439                                 * unscheduled, resulting in a noop
6440                                 * for this device.
6441                                 */
6442                                linkwatch_run_queue();
6443                        }
6444
6445                        __rtnl_unlock();
6446
6447                        rebroadcast_time = jiffies;
6448                }
6449
6450                msleep(250);
6451
6452                refcnt = netdev_refcnt_read(dev);
6453
6454                if (time_after(jiffies, warning_time + 10 * HZ)) {
6455                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6456                                 dev->name, refcnt);
6457                        warning_time = jiffies;
6458                }
6459        }
6460}
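/*
 * Example (illustrative sketch, not part of the original file): a subsystem
 * that pinned a device with dev_hold() must drop that reference when it
 * sees NETDEV_UNREGISTER, otherwise netdev_wait_allrefs() above keeps
 * rebroadcasting and logging "waiting for ... to become free".  The
 * foo_tracked_dev pointer is hypothetical:
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UNREGISTER && dev == foo_tracked_dev) {
 *			foo_tracked_dev = NULL;
 *			dev_put(dev);
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 * The block would be registered with register_netdevice_notifier().
 */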
6461
6462/* The sequence is:
6463 *
6464 *      rtnl_lock();
6465 *      ...
6466 *      register_netdevice(x1);
6467 *      register_netdevice(x2);
6468 *      ...
6469 *      unregister_netdevice(y1);
6470 *      unregister_netdevice(y2);
6471 *      ...
6472 *      rtnl_unlock();
6473 *      free_netdev(y1);
6474 *      free_netdev(y2);
6475 *
6476 * We are invoked by rtnl_unlock().
6477 * This allows us to deal with problems:
6478 * 1) We can delete sysfs objects which invoke hotplug
6479 *    without deadlocking with linkwatch via keventd.
6480 * 2) Since we run with the RTNL semaphore not held, we can sleep
6481 *    safely in order to wait for the netdev refcnt to drop to zero.
6482 *
6483 * We must not return until all unregister events added during
6484 * the interval the lock was held have been completed.
6485 */
6486void netdev_run_todo(void)
6487{
6488        struct list_head list;
6489
6490        /* Snapshot list, allow later requests */
6491        list_replace_init(&net_todo_list, &list);
6492
6493        __rtnl_unlock();
6494
6495
6496        /* Wait for rcu callbacks to finish before next phase */
6497        if (!list_empty(&list))
6498                rcu_barrier();
6499
6500        while (!list_empty(&list)) {
6501                struct net_device *dev
6502                        = list_first_entry(&list, struct net_device, todo_list);
6503                list_del(&dev->todo_list);
6504
6505                rtnl_lock();
6506                call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6507                __rtnl_unlock();
6508
6509                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6510                        pr_err("network todo '%s' but state %d\n",
6511                               dev->name, dev->reg_state);
6512                        dump_stack();
6513                        continue;
6514                }
6515
6516                dev->reg_state = NETREG_UNREGISTERED;
6517
6518                netdev_wait_allrefs(dev);
6519
6520                /* paranoia */
6521                BUG_ON(netdev_refcnt_read(dev));
6522                WARN_ON(rcu_access_pointer(dev->ip_ptr));
6523                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6524                WARN_ON(dev->dn_ptr);
6525
6526                if (dev->destructor)
6527                        dev->destructor(dev);
6528
6529                /* Report a network device has been unregistered */
6530                rtnl_lock();
6531                dev_net(dev)->dev_unreg_count--;
6532                __rtnl_unlock();
6533                wake_up(&netdev_unregistering_wq);
6534
6535                /* Free network device */
6536                kobject_put(&dev->dev.kobj);
6537        }
6538}
6539
6540/* Convert net_device_stats to rtnl_link_stats64.  They have the same
6541 * fields in the same order, with only the type differing.
6542 */
6543void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6544                             const struct net_device_stats *netdev_stats)
6545{
6546#if BITS_PER_LONG == 64
6547        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6548        memcpy(stats64, netdev_stats, sizeof(*stats64));
6549#else
6550        size_t i, n = sizeof(*stats64) / sizeof(u64);
6551        const unsigned long *src = (const unsigned long *)netdev_stats;
6552        u64 *dst = (u64 *)stats64;
6553
6554        BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6555                     sizeof(*stats64) / sizeof(u64));
6556        for (i = 0; i < n; i++)
6557                dst[i] = src[i];
6558#endif
6559}
6560EXPORT_SYMBOL(netdev_stats_to_stats64);
6561
6562/**
6563 *      dev_get_stats   - get network device statistics
6564 *      @dev: device to get statistics from
6565 *      @storage: place to store stats
6566 *
6567 *      Get network statistics from device. Return @storage.
6568 *      The device driver may provide its own method by setting
6569 *      dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6570 *      otherwise the internal statistics structure is used.
6571 */
6572struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6573                                        struct rtnl_link_stats64 *storage)
6574{
6575        const struct net_device_ops *ops = dev->netdev_ops;
6576
6577        if (ops->ndo_get_stats64) {
6578                memset(storage, 0, sizeof(*storage));
6579                ops->ndo_get_stats64(dev, storage);
6580        } else if (ops->ndo_get_stats) {
6581                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6582        } else {
6583                netdev_stats_to_stats64(storage, &dev->stats);
6584        }
6585        storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6586        storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6587        return storage;
6588}
6589EXPORT_SYMBOL(dev_get_stats);
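/*
 * Example (illustrative sketch, not part of the original file): a minimal
 * driver-side ndo_get_stats64() that dev_get_stats() would call.  struct
 * foo_priv and its per-cpu foo_pcpu_stats are hypothetical, and a real
 * driver would also wrap the reads in u64_stats_fetch_begin_irq():
 *
 *	static struct rtnl_link_stats64 *
 *	foo_get_stats64(struct net_device *dev,
 *			struct rtnl_link_stats64 *storage)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *		int cpu;
 *
 *		for_each_possible_cpu(cpu) {
 *			const struct foo_pcpu_stats *s =
 *				per_cpu_ptr(priv->stats, cpu);
 *
 *			storage->rx_packets += s->rx_packets;
 *			storage->tx_packets += s->tx_packets;
 *		}
 *		return storage;
 *	}
 *
 * dev_get_stats() zeroes @storage before the call and adds the core
 * rx_dropped/tx_dropped counters afterwards.
 */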
6590
6591struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6592{
6593        struct netdev_queue *queue = dev_ingress_queue(dev);
6594
6595#ifdef CONFIG_NET_CLS_ACT
6596        if (queue)
6597                return queue;
6598        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6599        if (!queue)
6600                return NULL;
6601        netdev_init_one_queue(dev, queue, NULL);
6602        queue->qdisc = &noop_qdisc;
6603        queue->qdisc_sleeping = &noop_qdisc;
6604        rcu_assign_pointer(dev->ingress_queue, queue);
6605#endif
6606        return queue;
6607}
6608
6609static const struct ethtool_ops default_ethtool_ops;
6610
6611void netdev_set_default_ethtool_ops(struct net_device *dev,
6612                                    const struct ethtool_ops *ops)
6613{
6614        if (dev->ethtool_ops == &default_ethtool_ops)
6615                dev->ethtool_ops = ops;
6616}
6617EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6618
6619void netdev_freemem(struct net_device *dev)
6620{
6621        char *addr = (char *)dev - dev->padded;
6622
6623        kvfree(addr);
6624}
6625
6626/**
6627 *      alloc_netdev_mqs - allocate network device
6628 *      @sizeof_priv:           size of private data to allocate space for
6629 *      @name:                  device name format string
6630 *      @name_assign_type:      origin of device name
6631 *      @setup:                 callback to initialize device
6632 *      @txqs:                  the number of TX subqueues to allocate
6633 *      @rxqs:                  the number of RX subqueues to allocate
6634 *
6635 *      Allocates a struct net_device with private data area for driver use
6636 *      and performs basic initialization.  Also allocates subqueue structs
6637 *      for each queue on the device.
6638 */
6639struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6640                unsigned char name_assign_type,
6641                void (*setup)(struct net_device *),
6642                unsigned int txqs, unsigned int rxqs)
6643{
6644        struct net_device *dev;
6645        size_t alloc_size;
6646        struct net_device *p;
6647
6648        BUG_ON(strlen(name) >= sizeof(dev->name));
6649
6650        if (txqs < 1) {
6651                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6652                return NULL;
6653        }
6654
6655#ifdef CONFIG_SYSFS
6656        if (rxqs < 1) {
6657                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6658                return NULL;
6659        }
6660#endif
6661
6662        alloc_size = sizeof(struct net_device);
6663        if (sizeof_priv) {
6664                /* ensure 32-byte alignment of private area */
6665                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6666                alloc_size += sizeof_priv;
6667        }
6668        /* ensure 32-byte alignment of whole construct */
6669        alloc_size += NETDEV_ALIGN - 1;
6670
6671        p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6672        if (!p)
6673                p = vzalloc(alloc_size);
6674        if (!p)
6675                return NULL;
6676
6677        dev = PTR_ALIGN(p, NETDEV_ALIGN);
6678        dev->padded = (char *)dev - (char *)p;
6679
6680        dev->pcpu_refcnt = alloc_percpu(int);
6681        if (!dev->pcpu_refcnt)
6682                goto free_dev;
6683
6684        if (dev_addr_init(dev))
6685                goto free_pcpu;
6686
6687        dev_mc_init(dev);
6688        dev_uc_init(dev);
6689
6690        dev_net_set(dev, &init_net);
6691
6692        dev->gso_max_size = GSO_MAX_SIZE;
6693        dev->gso_max_segs = GSO_MAX_SEGS;
6694        dev->gso_min_segs = 0;
6695
6696        INIT_LIST_HEAD(&dev->napi_list);
6697        INIT_LIST_HEAD(&dev->unreg_list);
6698        INIT_LIST_HEAD(&dev->close_list);
6699        INIT_LIST_HEAD(&dev->link_watch_list);
6700        INIT_LIST_HEAD(&dev->adj_list.upper);
6701        INIT_LIST_HEAD(&dev->adj_list.lower);
6702        INIT_LIST_HEAD(&dev->all_adj_list.upper);
6703        INIT_LIST_HEAD(&dev->all_adj_list.lower);
6704        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6705        setup(dev);
6706
6707        dev->num_tx_queues = txqs;
6708        dev->real_num_tx_queues = txqs;
6709        if (netif_alloc_netdev_queues(dev))
6710                goto free_all;
6711
6712#ifdef CONFIG_SYSFS
6713        dev->num_rx_queues = rxqs;
6714        dev->real_num_rx_queues = rxqs;
6715        if (netif_alloc_rx_queues(dev))
6716                goto free_all;
6717#endif
6718
6719        strcpy(dev->name, name);
6720        dev->name_assign_type = name_assign_type;
6721        dev->group = INIT_NETDEV_GROUP;
6722        if (!dev->ethtool_ops)
6723                dev->ethtool_ops = &default_ethtool_ops;
6724        return dev;
6725
6726free_all:
6727        free_netdev(dev);
6728        return NULL;
6729
6730free_pcpu:
6731        free_percpu(dev->pcpu_refcnt);
6732free_dev:
6733        netdev_freemem(dev);
6734        return NULL;
6735}
6736EXPORT_SYMBOL(alloc_netdev_mqs);
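/*
 * Example (illustrative sketch, not part of the original file): allocating
 * a multiqueue device with eight TX and eight RX queues.  The "foo%d" name
 * pattern, foo_setup() and struct foo_priv are hypothetical:
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, foo_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 *
 * The common single-queue case uses the alloc_netdev() wrapper, which
 * simply passes 1 for both txqs and rxqs.
 */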
6737
6738/**
6739 *      free_netdev - free network device
6740 *      @dev: device
6741 *
6742 *      This function does the last stage of destroying an allocated device
6743 *      interface. The reference to the device object is released.
6744 *      If this is the last reference then it will be freed.
6745 */
6746void free_netdev(struct net_device *dev)
6747{
6748        struct napi_struct *p, *n;
6749
6750        release_net(dev_net(dev));
6751
6752        netif_free_tx_queues(dev);
6753#ifdef CONFIG_SYSFS
6754        kfree(dev->_rx);
6755#endif
6756
6757        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6758
6759        /* Flush device addresses */
6760        dev_addr_flush(dev);
6761
6762        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6763                netif_napi_del(p);
6764
6765        free_percpu(dev->pcpu_refcnt);
6766        dev->pcpu_refcnt = NULL;
6767
6768        /*  Compatibility with error handling in drivers */
6769        if (dev->reg_state == NETREG_UNINITIALIZED) {
6770                netdev_freemem(dev);
6771                return;
6772        }
6773
6774        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6775        dev->reg_state = NETREG_RELEASED;
6776
6777        /* will free via device release */
6778        put_device(&dev->dev);
6779}
6780EXPORT_SYMBOL(free_netdev);
6781
6782/**
6783 *      synchronize_net -  Synchronize with packet receive processing
6784 *
6785 *      Wait for packets currently being received to be done.
6786 *      Does not block later packets from starting.
6787 */
6788void synchronize_net(void)
6789{
6790        might_sleep();
6791        if (rtnl_is_locked())
6792                synchronize_rcu_expedited();
6793        else
6794                synchronize_rcu();
6795}
6796EXPORT_SYMBOL(synchronize_net);
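/*
 * Example (illustrative sketch, not part of the original file): the
 * "unpublish, wait, free" pattern this helper supports.  foo_handler is a
 * hypothetical RCU-protected pointer dereferenced by the receive path, and
 * "old" the previously published object:
 *
 *	old = rtnl_dereference(foo_handler);
 *	RCU_INIT_POINTER(foo_handler, NULL);
 *	synchronize_net();		// no receive path still uses old
 *	kfree(old);
 */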
6797
6798/**
6799 *      unregister_netdevice_queue - remove device from the kernel
6800 *      @dev: device
6801 *      @head: list
6802 *
6803 *      This function shuts down a device interface and removes it
6804 *      from the kernel tables.
6805 *      If @head is not NULL, the device is queued to be unregistered later.
6806 *
6807 *      Callers must hold the rtnl semaphore.  You may want
6808 *      unregister_netdev() instead of this.
6809 */
6810
6811void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6812{
6813        ASSERT_RTNL();
6814
6815        if (head) {
6816                list_move_tail(&dev->unreg_list, head);
6817        } else {
6818                rollback_registered(dev);
6819                /* Finish processing unregister after unlock */
6820                net_set_todo(dev);
6821        }
6822}
6823EXPORT_SYMBOL(unregister_netdevice_queue);
6824
6825/**
6826 *      unregister_netdevice_many - unregister many devices
6827 *      @head: list of devices
6828 *
6829 *  Note: As most callers use a stack allocated list_head,
6830 *  we force a list_del() to make sure the stack won't be corrupted later.
6831 */
6832void unregister_netdevice_many(struct list_head *head)
6833{
6834        struct net_device *dev;
6835
6836        if (!list_empty(head)) {
6837                rollback_registered_many(head);
6838                list_for_each_entry(dev, head, unreg_list)
6839                        net_set_todo(dev);
6840                list_del(head);
6841        }
6842}
6843EXPORT_SYMBOL(unregister_netdevice_many);
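/*
 * Example (illustrative sketch, not part of the original file): tearing
 * down a whole group of devices in one RTNL section, so the expensive
 * synchronization in rollback_registered_many() is paid only once.  The
 * foo_dev_list walk and struct foo_port are hypothetical:
 *
 *	LIST_HEAD(kill_list);
 *	struct foo_port *port;
 *
 *	rtnl_lock();
 *	list_for_each_entry(port, &foo_dev_list, list)
 *		unregister_netdevice_queue(port->dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */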
6844
6845/**
6846 *      unregister_netdev - remove device from the kernel
6847 *      @dev: device
6848 *
6849 *      This function shuts down a device interface and removes it
6850 *      from the kernel tables.
6851 *
6852 *      This is just a wrapper for unregister_netdevice that takes
6853 *      the rtnl semaphore.  In general you want to use this and not
6854 *      unregister_netdevice.
6855 */
6856void unregister_netdev(struct net_device *dev)
6857{
6858        rtnl_lock();
6859        unregister_netdevice(dev);
6860        rtnl_unlock();
6861}
6862EXPORT_SYMBOL(unregister_netdev);
6863
6864/**
6865 *      dev_change_net_namespace - move device to a different network namespace
6866 *      @dev: device
6867 *      @net: network namespace
6868 *      @pat: If not NULL name pattern to try if the current device name
6869 *            is already taken in the destination network namespace.
6870 *
6871 *      This function shuts down a device interface and moves it
6872 *      to a new network namespace. On success 0 is returned, on
6873 *      a failure a negative errno code is returned.
6874 *
6875 *      Callers must hold the rtnl semaphore.
6876 */
6877
6878int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6879{
6880        int err;
6881
6882        ASSERT_RTNL();
6883
6884        /* Don't allow namespace local devices to be moved. */
6885        err = -EINVAL;
6886        if (dev->features & NETIF_F_NETNS_LOCAL)
6887                goto out;
6888
6889        /* Ensure the device has been registered */
6890        if (dev->reg_state != NETREG_REGISTERED)
6891                goto out;
6892
6893        /* Get out if there is nothing to do */
6894        err = 0;
6895        if (net_eq(dev_net(dev), net))
6896                goto out;
6897
6898        /* Pick the destination device name, and ensure
6899         * we can use it in the destination network namespace.
6900         */
6901        err = -EEXIST;
6902        if (__dev_get_by_name(net, dev->name)) {
6903                /* We get here if we can't use the current device name */
6904                if (!pat)
6905                        goto out;
6906                if (dev_get_valid_name(net, dev, pat) < 0)
6907                        goto out;
6908        }
6909
6910        /*
6911         * And now a mini version of register_netdevice and unregister_netdevice.
6912         */
6913
6914        /* If device is running close it first. */
6915        dev_close(dev);
6916
6917        /* And unlink it from device chain */
6918        err = -ENODEV;
6919        unlist_netdevice(dev);
6920
6921        synchronize_net();
6922
6923        /* Shutdown queueing discipline. */
6924        dev_shutdown(dev);
6925
6926        /* Notify protocols that we are about to destroy
6927           this device. They should clean up all of their state.
6928
6929           Note that dev->reg_state stays at NETREG_REGISTERED.
6930           This is wanted because this way 8021q and macvlan know
6931           the device is just moving and can keep their slaves up.
6932        */
6933        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6934        rcu_barrier();
6935        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6936        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6937
6938        /*
6939         *      Flush the unicast and multicast chains
6940         */
6941        dev_uc_flush(dev);
6942        dev_mc_flush(dev);
6943
6944        /* Send a netdev-removed uevent to the old namespace */
6945        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6946        netdev_adjacent_del_links(dev);
6947
6948        /* Actually switch the network namespace */
6949        dev_net_set(dev, net);
6950
6951        /* If there is an ifindex conflict assign a new one */
6952        if (__dev_get_by_index(net, dev->ifindex)) {
6953                int iflink = (dev->iflink == dev->ifindex);
6954                dev->ifindex = dev_new_index(net);
6955                if (iflink)
6956                        dev->iflink = dev->ifindex;
6957        }
6958
6959        /* Send a netdev-add uevent to the new namespace */
6960        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6961        netdev_adjacent_add_links(dev);
6962
6963        /* Fixup kobjects */
6964        err = device_rename(&dev->dev, dev->name);
6965        WARN_ON(err);
6966
6967        /* Add the device back in the hashes */
6968        list_netdevice(dev);
6969
6970        /* Notify protocols that a new device appeared. */
6971        call_netdevice_notifiers(NETDEV_REGISTER, dev);
6972
6973        /*
6974         *      Prevent userspace races by waiting until the network
6975         *      device is fully set up before sending notifications.
6976         */
6977        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6978
6979        synchronize_net();
6980        err = 0;
6981out:
6982        return err;
6983}
6984EXPORT_SYMBOL_GPL(dev_change_net_namespace);
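/*
 * Example (illustrative sketch, not part of the original file): moving a
 * device into another namespace under the RTNL, falling back to kernel
 * "eth%d" naming if its current name is already taken there.  target_net
 * is a hypothetical struct net pointer held by the caller:
 *
 *	ASSERT_RTNL();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 */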
6985
6986static int dev_cpu_callback(struct notifier_block *nfb,
6987                            unsigned long action,
6988                            void *ocpu)
6989{
6990        struct sk_buff **list_skb;
6991        struct sk_buff *skb;
6992        unsigned int cpu, oldcpu = (unsigned long)ocpu;
6993        struct softnet_data *sd, *oldsd;
6994
6995        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6996                return NOTIFY_OK;
6997
6998        local_irq_disable();
6999        cpu = smp_processor_id();
7000        sd = &per_cpu(softnet_data, cpu);
7001        oldsd = &per_cpu(softnet_data, oldcpu);
7002
7003        /* Find end of our completion_queue. */
7004        list_skb = &sd->completion_queue;
7005        while (*list_skb)
7006                list_skb = &(*list_skb)->next;
7007        /* Append completion queue from offline CPU. */
7008        *list_skb = oldsd->completion_queue;
7009        oldsd->completion_queue = NULL;
7010
7011        /* Append output queue from offline CPU. */
7012        if (oldsd->output_queue) {
7013                *sd->output_queue_tailp = oldsd->output_queue;
7014                sd->output_queue_tailp = oldsd->output_queue_tailp;
7015                oldsd->output_queue = NULL;
7016                oldsd->output_queue_tailp = &oldsd->output_queue;
7017        }
7018        /* Append NAPI poll list from offline CPU, with one exception :
7019         * process_backlog() must be called by cpu owning percpu backlog.
7020         * We properly handle process_queue & input_pkt_queue later.
7021         */
7022        while (!list_empty(&oldsd->poll_list)) {
7023                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7024                                                            struct napi_struct,
7025                                                            poll_list);
7026
7027                list_del_init(&napi->poll_list);
7028                if (napi->poll == process_backlog)
7029                        napi->state = 0;
7030                else
7031                        ____napi_schedule(sd, napi);
7032        }
7033
7034        raise_softirq_irqoff(NET_TX_SOFTIRQ);
7035        local_irq_enable();
7036
7037        /* Process offline CPU's input_pkt_queue */
7038        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7039                netif_rx_internal(skb);
7040                input_queue_head_incr(oldsd);
7041        }
7042        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7043                netif_rx_internal(skb);
7044                input_queue_head_incr(oldsd);
7045        }
7046
7047        return NOTIFY_OK;
7048}
7049
7050
7051/**
7052 *      netdev_increment_features - increment feature set by one
7053 *      @all: current feature set
7054 *      @one: new feature set
7055 *      @mask: mask feature set
7056 *
7057 *      Computes a new feature set after adding a device with feature set
7058 *      @one to the master device with current feature set @all.  Will not
7059 *      enable anything that is off in @mask. Returns the new feature set.
7060 */
7061netdev_features_t netdev_increment_features(netdev_features_t all,
7062        netdev_features_t one, netdev_features_t mask)
7063{
7064        if (mask & NETIF_F_GEN_CSUM)
7065                mask |= NETIF_F_ALL_CSUM;
7066        mask |= NETIF_F_VLAN_CHALLENGED;
7067
7068        all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7069        all &= one | ~NETIF_F_ALL_FOR_ALL;
7070
7071        /* If one device supports hw checksumming, set for all. */
7072        if (all & NETIF_F_GEN_CSUM)
7073                all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7074
7075        return all;
7076}
7077EXPORT_SYMBOL(netdev_increment_features);
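/*
 * Example (illustrative sketch, not part of the original file): how an
 * aggregating device (bond/bridge/team style) could fold each slave's
 * feature set into its own with this helper.  FOO_MASTER_FEATURES,
 * FOO_FEATURE_MASK and the foo_slave list are hypothetical:
 *
 *	netdev_features_t features = FOO_MASTER_FEATURES;
 *	struct foo_slave *s;
 *
 *	list_for_each_entry(s, &foo_slave_list, list)
 *		features = netdev_increment_features(features,
 *						     s->dev->features,
 *						     FOO_FEATURE_MASK);
 *	master->features = features;
 */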
7078
7079static struct hlist_head * __net_init netdev_create_hash(void)
7080{
7081        int i;
7082        struct hlist_head *hash;
7083
7084        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7085        if (hash != NULL)
7086                for (i = 0; i < NETDEV_HASHENTRIES; i++)
7087                        INIT_HLIST_HEAD(&hash[i]);
7088
7089        return hash;
7090}
7091
7092/* Initialize per network namespace state */
7093static int __net_init netdev_init(struct net *net)
7094{
7095        if (net != &init_net)
7096                INIT_LIST_HEAD(&net->dev_base_head);
7097
7098        net->dev_name_head = netdev_create_hash();
7099        if (net->dev_name_head == NULL)
7100                goto err_name;
7101
7102        net->dev_index_head = netdev_create_hash();
7103        if (net->dev_index_head == NULL)
7104                goto err_idx;
7105
7106        return 0;
7107
7108err_idx:
7109        kfree(net->dev_name_head);
7110err_name:
7111        return -ENOMEM;
7112}
7113
7114/**
7115 *      netdev_drivername - network driver for the device
7116 *      @dev: network device
7117 *
7118 *      Determine network driver for device.
7119 */
7120const char *netdev_drivername(const struct net_device *dev)
7121{
7122        const struct device_driver *driver;
7123        const struct device *parent;
7124        const char *empty = "";
7125
7126        parent = dev->dev.parent;
7127        if (!parent)
7128                return empty;
7129
7130        driver = parent->driver;
7131        if (driver && driver->name)
7132                return driver->name;
7133        return empty;
7134}
7135
7136static void __netdev_printk(const char *level, const struct net_device *dev,
7137                            struct va_format *vaf)
7138{
7139        if (dev && dev->dev.parent) {
7140                dev_printk_emit(level[1] - '0',
7141                                dev->dev.parent,
7142                                "%s %s %s%s: %pV",
7143                                dev_driver_string(dev->dev.parent),
7144                                dev_name(dev->dev.parent),
7145                                netdev_name(dev), netdev_reg_state(dev),
7146                                vaf);
7147        } else if (dev) {
7148                printk("%s%s%s: %pV",
7149                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7150        } else {
7151                printk("%s(NULL net_device): %pV", level, vaf);
7152        }
7153}
7154
7155void netdev_printk(const char *level, const struct net_device *dev,
7156                   const char *format, ...)
7157{
7158        struct va_format vaf;
7159        va_list args;
7160
7161        va_start(args, format);
7162
7163        vaf.fmt = format;
7164        vaf.va = &args;
7165
7166        __netdev_printk(level, dev, &vaf);
7167
7168        va_end(args);
7169}
7170EXPORT_SYMBOL(netdev_printk);
7171
7172#define define_netdev_printk_level(func, level)                 \
7173void func(const struct net_device *dev, const char *fmt, ...)   \
7174{                                                               \
7175        struct va_format vaf;                                   \
7176        va_list args;                                           \
7177                                                                \
7178        va_start(args, fmt);                                    \
7179                                                                \
7180        vaf.fmt = fmt;                                          \
7181        vaf.va = &args;                                         \
7182                                                                \
7183        __netdev_printk(level, dev, &vaf);                      \
7184                                                                \
7185        va_end(args);                                           \
7186}                                                               \
7187EXPORT_SYMBOL(func);
7188
7189define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7190define_netdev_printk_level(netdev_alert, KERN_ALERT);
7191define_netdev_printk_level(netdev_crit, KERN_CRIT);
7192define_netdev_printk_level(netdev_err, KERN_ERR);
7193define_netdev_printk_level(netdev_warn, KERN_WARNING);
7194define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7195define_netdev_printk_level(netdev_info, KERN_INFO);
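/*
 * Example (illustrative sketch, not part of the original file): driver-side
 * use of the level wrappers generated above; the output is prefixed with
 * the bus device and interface name.  The ring/speed/full_duplex variables
 * are hypothetical:
 *
 *	netdev_err(dev, "TX ring %d stalled, resetting\n", ring);
 *	netdev_info(dev, "link up, %u Mbps, %s duplex\n", speed,
 *		    full_duplex ? "full" : "half");
 */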
7196
7197static void __net_exit netdev_exit(struct net *net)
7198{
7199        kfree(net->dev_name_head);
7200        kfree(net->dev_index_head);
7201}
7202
7203static struct pernet_operations __net_initdata netdev_net_ops = {
7204        .init = netdev_init,
7205        .exit = netdev_exit,
7206};
7207
7208static void __net_exit default_device_exit(struct net *net)
7209{
7210        struct net_device *dev, *aux;
7211        /*
7212         * Push all migratable network devices back to the
7213         * initial network namespace
7214         */
7215        rtnl_lock();
7216        for_each_netdev_safe(net, dev, aux) {
7217                int err;
7218                char fb_name[IFNAMSIZ];
7219
7220                /* Ignore unmovable devices (i.e. loopback) */
7221                if (dev->features & NETIF_F_NETNS_LOCAL)
7222                        continue;
7223
7224                /* Leave virtual devices for the generic cleanup */
7225                if (dev->rtnl_link_ops)
7226                        continue;
7227
7228                /* Push remaining network devices to init_net */
7229                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7230                err = dev_change_net_namespace(dev, &init_net, fb_name);
7231                if (err) {
7232                        pr_emerg("%s: failed to move %s to init_net: %d\n",
7233                                 __func__, dev->name, err);
7234                        BUG();
7235                }
7236        }
7237        rtnl_unlock();
7238}
7239
7240static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7241{
7242        /* Return with the rtnl_lock held when there are no network
7243         * devices unregistering in any network namespace in net_list.
7244         */
7245        struct net *net;
7246        bool unregistering;
7247        DEFINE_WAIT(wait);
7248
7249        for (;;) {
7250                prepare_to_wait(&netdev_unregistering_wq, &wait,
7251                                TASK_UNINTERRUPTIBLE);
7252                unregistering = false;
7253                rtnl_lock();
7254                list_for_each_entry(net, net_list, exit_list) {
7255                        if (net->dev_unreg_count > 0) {
7256                                unregistering = true;
7257                                break;
7258                        }
7259                }
7260                if (!unregistering)
7261                        break;
7262                __rtnl_unlock();
7263                schedule();
7264        }
7265        finish_wait(&netdev_unregistering_wq, &wait);
7266}
7267
7268static void __net_exit default_device_exit_batch(struct list_head *net_list)
7269{
7270        /* At exit all network devices must be removed from a network
7271         * namespace.  Do this in the reverse order of registration.
7272         * Do this across as many network namespaces as possible to
7273         * improve batching efficiency.
7274         */
7275        struct net_device *dev;
7276        struct net *net;
7277        LIST_HEAD(dev_kill_list);
7278
7279        /* To prevent network device cleanup code from dereferencing
7280         * loopback devices or network devices that have been freed
7281         * wait here for all pending unregistrations to complete,
7282         * before unregistering the loopback device and allowing the
7283         * network namespace to be freed.
7284         *
7285         * The netdev todo list containing all network devices
7286         * unregistrations that happen in default_device_exit_batch
7287         * will run in the rtnl_unlock() at the end of
7288         * default_device_exit_batch.
7289         */
7290        rtnl_lock_unregistering(net_list);
7291        list_for_each_entry(net, net_list, exit_list) {
7292                for_each_netdev_reverse(net, dev) {
7293                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7294                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7295                        else
7296                                unregister_netdevice_queue(dev, &dev_kill_list);
7297                }
7298        }
7299        unregister_netdevice_many(&dev_kill_list);
7300        rtnl_unlock();
7301}
7302
7303static struct pernet_operations __net_initdata default_device_ops = {
7304        .exit = default_device_exit,
7305        .exit_batch = default_device_exit_batch,
7306};
7307
7308/*
7309 *      Initialize the DEV module. At boot time this walks the device list and
7310 *      unhooks any devices that fail to initialise (normally hardware not
7311 *      present) and leaves us with a valid list of present and active devices.
7312 *
7313 */
7314
7315/*
7316 *       This is called single threaded during boot, so no need
7317 *       to take the rtnl semaphore.
7318 */
7319static int __init net_dev_init(void)
7320{
7321        int i, rc = -ENOMEM;
7322
7323        BUG_ON(!dev_boot_phase);
7324
7325        if (dev_proc_init())
7326                goto out;
7327
7328        if (netdev_kobject_init())
7329                goto out;
7330
7331        INIT_LIST_HEAD(&ptype_all);
7332        for (i = 0; i < PTYPE_HASH_SIZE; i++)
7333                INIT_LIST_HEAD(&ptype_base[i]);
7334
7335        INIT_LIST_HEAD(&offload_base);
7336
7337        if (register_pernet_subsys(&netdev_net_ops))
7338                goto out;
7339
7340        /*
7341         *      Initialise the packet receive queues.
7342         */
7343
7344        for_each_possible_cpu(i) {
7345                struct softnet_data *sd = &per_cpu(softnet_data, i);
7346
7347                skb_queue_head_init(&sd->input_pkt_queue);
7348                skb_queue_head_init(&sd->process_queue);
7349                INIT_LIST_HEAD(&sd->poll_list);
7350                sd->output_queue_tailp = &sd->output_queue;
7351#ifdef CONFIG_RPS
7352                sd->csd.func = rps_trigger_softirq;
7353                sd->csd.info = sd;
7354                sd->cpu = i;
7355#endif
7356
7357                sd->backlog.poll = process_backlog;
7358                sd->backlog.weight = weight_p;
7359        }
7360
7361        dev_boot_phase = 0;
7362
7363        /* The loopback device is special: if any other network device
7364         * is present in a network namespace, the loopback device must
7365         * be present too. Since we now dynamically allocate and free
7366         * the loopback device, ensure this invariant is maintained by
7367         * keeping the loopback device as the first device on the
7368         * list of network devices, so that it is the first device
7369         * that appears and the last network device that
7370         * disappears.
7371         */
7372        if (register_pernet_device(&loopback_net_ops))
7373                goto out;
7374
7375        if (register_pernet_device(&default_device_ops))
7376                goto out;
7377
7378        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7379        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7380
7381        hotcpu_notifier(dev_cpu_callback, 0);
7382        dst_init();
7383        rc = 0;
7384out:
7385        return rc;
7386}
7387
7388subsys_initcall(net_dev_init);