之前分析过 Linux kernel 2.6.32 的 bridge 转发逻辑,下面分析一下 Linux kernel 3.10 的 bridge 转发逻辑。这两个版本正是 CentOS 6 和 CentOS 7 对应的内核。3.10 kernel 中 bridge 逻辑的最大改变就是增加了 vlan 处理逻辑,以及 bridge 入口函数的设置方式。
1. netdev_rx_handler_register
在分析之前首先要介绍一个重要函数:netdev_rx_handler_register,这个函数是 2.6.32 内核所没有的(2.6.36 才引入)。
netdev_rx_handler_register
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/**
 * netdev_rx_handler_register - attach a receive handler to a device
 * @dev: device the receive handler is registered on
 * @rx_handler: receive handler to install
 * @rx_handler_data: data pointer stored next to the handler for its use
 *
 * ASSERT_RTNL() is invoked first, so callers are expected to hold the RTNL
 * lock.
 *
 * Return: 0 on success, -EBUSY if @dev already has a receive handler.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	int err = -EBUSY;

	ASSERT_RTNL();

	if (!dev->rx_handler) {
		/* Note: rx_handler_data must be set before rx_handler */
		rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
		rcu_assign_pointer(dev->rx_handler, rx_handler);
		err = 0;
	}

	return err;
}
这个函数可以给设备(net_device)注册接收函数,然后在 __netif_receive_skb 函数中根据接收 skb 的设备接口,再调用这个被注册的接收函数。比如为网桥下的接口注册 br_handle_frame 函数,为 bonding 接口注册 bond_handle_frame 函数。这相对于老式的网桥处理更灵活,有了这个机制也可以在模块中自行注册处理函数。比如 3.10 中的 openvswitch(Open vSwitch 在 3.10 已经合入了内核)创建 netdev vport 的函数 netdev_create。
netdev_create
1
2
3
4
5
6
7
static struct vport *netdev_create(const struct vport_parms *parms)
{
struct vport *vport;
/....../
err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,vport);
/....../
}
这个函数在创建 netdev vport 时将设备的接收函数设置为 netdev_frame_hook 函数,这也是整个 openvswitch 的入口函数。如果查看 Open vSwitch 的源码,可以看到当安装于 2.6 内核时,这里是替换掉 bridge 的 br_handle_frame_hook 函数,从而由 bridge 逻辑进入 Open vSwitch 逻辑。
2. Bridge 转发逻辑分析
还是先从 netif_receive_skb 函数分析,这个函数算是进入协议栈的入口。
netif_receive_skb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * netif_receive_skb - entry point for handing a received skb to the stack.
 *
 * Returns NET_RX_SUCCESS when skb_defer_rx_timestamp() reports that it took
 * the skb; otherwise returns the result of either enqueue_to_backlog() (RPS
 * steering) or __netif_receive_skb().
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

	rcu_read_lock();

#ifdef CONFIG_RPS
	/*
	 * RPS handling: when RPS is in use the packet is spread across the
	 * per-CPU receive queues for load balancing instead of being
	 * processed on this CPU.
	 */
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow dummy_flow;
		struct rps_dev_flow *flow = &dummy_flow;
		int target_cpu = get_rps_cpu(skb->dev, skb, &flow);

		if (target_cpu >= 0) {
			rc = enqueue_to_backlog(skb, target_cpu,
						&flow->last_qtail);
			rcu_read_unlock();
			return rc;
		}
	}
#endif

	rc = __netif_receive_skb(skb);
	rcu_read_unlock();
	return rc;
}
netif_receive_skb 只是对数据包进行了 RPS 的处理,然后调用 __netif_receive_skb。
__netif_receive_skb 并没有其他多余的处理逻辑,主要调用 __netif_receive_skb_core,这个函数才真正相当于 2.6 内核的 netif_receive_skb。以下代码省略了和 bridge 无关的逻辑。
__netif_receive_skb_core
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/*
 * __netif_receive_skb_core - core receive path (abridged excerpt; logic
 * unrelated to bridging is elided, and pfmemalloc is only used there).
 *
 * Visible steps:
 *   1. hand the skb to every matching ptype_all handler (packet taps);
 *   2. call the rx_handler registered on skb->dev, if there is one;
 *   3. hand the skb to the ptype_base handlers matching skb->protocol.
 *
 * Handlers are delivered one step late: each match is remembered in pt_prev
 * and delivered via deliver_skb() when the next match is found; the last
 * match is called directly at the end.
 *
 * Returns NET_RX_SUCCESS when the rx_handler consumed the skb, the last
 * protocol handler's result when one matched, and NET_RX_DROP otherwise.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	/*......*/

	orig_dev = skb->dev;
	skb_reset_network_header(skb);
	pt_prev = NULL;

	/* Re-entry point for RX_HANDLER_ANOTHER (see the switch below). */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	/* ptype_all handlers; this is where tcpdump captures packets. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	/* Call the rx_handler registered on the receiving device. */
	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			/* The handler took the skb; nothing more to do. */
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			/* Process the (possibly retargeted) skb again. */
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* Deliver only to exact device matches when the handler asked for it. */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	/* Pass the skb to the upper-layer protocol chosen by skb->protocol. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		/* No handler left for this skb: count it as dropped and free it. */
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		ret = NET_RX_DROP;
	}

out:
	return ret;
}
如果一个 dev 被添加到一个 bridge(作为 bridge 的一个接口),那么这个接口设备的 rx_handler 会被设置为 br_handle_frame 函数。这是在 br_add_if 函数中设置的,而 br_add_if(net/bridge/br_if.c)是在向网桥设备上添加接口时调用的。进入 br_handle_frame 也就进入了 bridge 的逻辑代码。
br_add_if
1
2
3
4
5
6
/*
 * br_add_if - add a device to a bridge (abridged excerpt).
 *
 * Only the rx_handler registration is shown; the rest of the function is
 * elided, including the declarations of err and p.
 */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
	/* ...... (elided in this excerpt) ...... */
	/* Install br_handle_frame as dev's rx_handler, with p as its handler data. */
	err = netdev_rx_handler_register(dev, br_handle_frame, p);
	/* ...... (elided in this excerpt) ...... */
}
br_handle_frame
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/*
 * br_handle_frame - rx_handler entry point into the bridge code.
 *
 * Returns RX_HANDLER_PASS when the skb should continue through the normal
 * receive path (loopback packets, link-local frames not consumed by the
 * LOCAL_IN hook, frames claimed by br_should_route_hook), and
 * RX_HANDLER_CONSUMED in every other case (forwarded, dropped, or the
 * skb_share_check() call returned NULL).
 */
rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
	struct net_bridge_port *p;
	struct sk_buff *skb = *pskb;
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	br_should_route_hook_t *rhook;
	/* Loopback packets are not bridged; hand them back untouched. */
	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;
	/* Drop frames whose source MAC is not a valid Ethernet address. */
	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
		goto drop;
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return RX_HANDLER_CONSUMED;
	/* Look up the bridge port that corresponds to the receiving device. */
	p = br_port_get_rcu(skb->dev);
	/* Special handling for link-local destination MAC addresses. */
	if (unlikely(is_link_local_ether_addr(dest))) {
		/*
		* See IEEE 802.1D Table 7-10 Reserved addresses
		*
		* Assignment Value
		* Bridge Group Address 01-80-C2-00-00-00
		* (MAC Control) 802.3 01-80-C2-00-00-01
		* (Link Aggregation) 802.3 01-80-C2-00-00-02
		* 802.1X PAE address 01-80-C2-00-00-03
		*
		* 802.1AB LLDP 01-80-C2-00-00-0E
		*
		* Others reserved for future standardization
		*/
		switch (dest[5]) {
		case 0x00: /* Bridge Group Address */
			/* If STP is turned off, then must forward to keep loop detection */
			if (p->br->stp_enabled == BR_NO_STP)
				goto forward;
			break;
		case 0x01: /* IEEE MAC (Pause) */
			goto drop;
		default:
			/* Allow selective forwarding for most other protocols */
			if (p->br->group_fwd_mask & (1u << dest[5]))
				goto forward;
		}
		/*
		 * LOCAL_IN hook. Passing this hook does not by itself mean the
		 * frame is delivered to the host stack; only the reserved
		 * 01-80-C2 destination addresses reach this point.
		 */
		if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
		NULL, br_handle_local_finish)) {
			return RX_HANDLER_CONSUMED; /* consumed by filter */
		} else {
			*pskb = skb;
			return RX_HANDLER_PASS; /* continue processing */
		}
	}
	/* Forwarding logic, selected by the state of the receiving port. */
forward:
	switch (p->state) {
	case BR_STATE_FORWARDING:
		rhook = rcu_dereference(br_should_route_hook);
		if (rhook) {
			/* A non-zero result hands the skb back to the normal receive path. */
			if ((*rhook)(skb)) {
				*pskb = skb;
				return RX_HANDLER_PASS;
			}
			/* Re-read the destination MAC after the hook has run. */
			dest = eth_hdr(skb)->h_dest;
		}
		/* fall through */
	case BR_STATE_LEARNING:
		/* Destination MAC equals the bridge device's MAC: mark the skb PACKET_HOST. */
		if (ether_addr_equal(p->br->dev->dev_addr, dest))
			skb->pkt_type = PACKET_HOST;
		/* NF_BR_PRE_ROUTING hook; br_handle_frame_finish is the continuation. */
		NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,br_handle_frame_finish);
		break;
	default:
		/* Any other port state, and every explicit "goto drop", frees the skb. */
drop:
		kfree_skb(skb);
	}
	return RX_HANDLER_CONSUMED;
}
经过 NF_BR_LOCAL_IN hook 点会执行 br_handle_local_finish 函数。
br_handle_local_finish
1
2
3
4
5
6
7
8
9
10
/*
 * br_handle_local_finish - continuation of the NF_BR_LOCAL_IN hook used by
 * br_handle_frame for link-local frames.
 *
 * Learns the frame's source MAC on the receiving port and always returns 0.
 */
static int br_handle_local_finish(struct sk_buff *skb)
{
	struct net_bridge_port *port = br_port_get_rcu(skb->dev);
	u16 vlan_id = 0;

	/* Fetch the skb's VLAN id (the 3.10 bridge is VLAN aware). */
	br_vlan_get_tag(skb, &vlan_id);

	/*
	 * Update the bridge's MAC table. The VLAN id is part of the call,
	 * so entries are kept per VLAN.
	 */
	br_fdb_update(port->br, port, eth_hdr(skb)->h_source, vlan_id);

	return 0; /* process further */
}
经过 NF_BR_PRE_ROUTING hook 点会执行 br_handle_frame_finish 函数。
br_handle_frame_finish
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/*
 * br_handle_frame_finish - continuation of the NF_BR_PRE_ROUTING hook.
 *
 * Learns the source MAC, then decides whether the frame is forwarded to
 * another port (skb), passed up to the local host (skb2), or both.
 * Returns the result of br_pass_frame_up() when the frame goes to the local
 * host, and 0 otherwise.
 */
int br_handle_frame_finish(struct sk_buff *skb)
{
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
	struct net_bridge *br;
	struct net_bridge_fdb_entry *dst;
	struct net_bridge_mdb_entry *mdst;
	struct sk_buff *skb2;
	u16 vid = 0;
	if (!p || p->state == BR_STATE_DISABLED)
		goto drop;
	/*
	 * VLAN ingress check, e.g. whether the frame's VLAN matches what is
	 * configured on the receiving port.
	 * NOTE(review): this path jumps to out rather than drop, so the skb is
	 * not freed here - presumably br_allowed_ingress() frees it on failure;
	 * confirm against its implementation.
	 */
	if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
		goto out;
	/* insert into forwarding database after filtering to avoid spoofing */
	br = p->br;
	/* Update the forwarding database with the source MAC and VLAN id. */
	br_fdb_update(br, p, eth_hdr(skb)->h_source, vid);
	/* Multicast (non-broadcast) destination: drop if br_multicast_rcv() returns non-zero. */
	if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
	br_multicast_rcv(br, p, skb))
		goto drop;
	/* A port in the LEARNING state only learns; the frame itself is dropped. */
	if (p->state == BR_STATE_LEARNING)
		goto drop;
	BR_INPUT_SKB_CB(skb)->brdev = br->dev;
	/* The packet skb2 goes to the local host (NULL to skip). */
	skb2 = NULL;
	/* The bridge device is in promiscuous mode: also deliver locally. */
	if (br->dev->flags & IFF_PROMISC)
		skb2 = skb;
	dst = NULL;
	/* Broadcast destination: deliver locally (and flood below). */
	if (is_broadcast_ether_addr(dest))
		skb2 = skb;
	else if (is_multicast_ether_addr(dest)) { /* multicast */
		mdst = br_mdb_get(br, skb, vid);
		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
			if ((mdst && mdst->mglist) ||
			br_multicast_is_router(br))
				skb2 = skb;
			br_multicast_forward(mdst, skb, skb2);
			/* Forwarding is done; clear skb so the generic forward step below is skipped. */
			skb = NULL;
			if (!skb2)
				goto out;
		} else
			skb2 = skb;
		br->dev->stats.multicast++;
	} else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) {/* destination is a local MAC: deliver to the host stack */
		skb2 = skb;
		/* Do not forward the packet since it's local. */
		skb = NULL;
	}
	if (skb) {
		if (dst) {
			dst->used = jiffies;
			br_forward(dst->dst, skb, skb2); // forward to the destination port
		} else
			br_flood_forward(br, skb, skb2); // no destination port found: flood
	}
	if (skb2)
		return br_pass_frame_up(skb2); // deliver to the local host stack
out:
	return 0;
drop:
	kfree_skb(skb);
	goto out;
}
{C}
我们先看发往本机协议栈的函数 br_pass_frame_up。
br_pass_frame_up
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * br_pass_frame_up - deliver a bridged frame to the local host stack.
 *
 * Returns NET_RX_DROP when the frame is rejected by the VLAN egress check or
 * br_handle_vlan() returns NULL; otherwise returns the NF_HOOK() result for
 * NF_BR_LOCAL_IN, whose continuation is netif_receive_skb.
 */
static int br_pass_frame_up(struct sk_buff *skb)
{
	struct net_device *bridge_dev = BR_INPUT_SKB_CB(skb)->brdev;
	struct net_bridge *br = netdev_priv(bridge_dev);
	struct net_device *ingress_dev;

	/* Statistics update omitted in this excerpt. */

	/* Bridge is just like any other port. Make sure the
	 * packet is allowed except in promisc mode when someone
	 * may be running packet capture.
	 */
	if (!((bridge_dev->flags & IFF_PROMISC) ||
	      br_allowed_egress(br, br_get_vlan_info(br), skb))) {
		/* Not promiscuous and rejected by the VLAN check: drop. */
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/* VLAN handling for the bridge device itself. */
	skb = br_handle_vlan(br, br_get_vlan_info(br), skb);
	if (skb == NULL)
		return NET_RX_DROP;

	ingress_dev = skb->dev;
	/* Key step: from here on the skb appears to arrive on the bridge device. */
	skb->dev = bridge_dev;

	/* Traverse NF_BR_LOCAL_IN, then re-enter the stack via netif_receive_skb. */
	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, ingress_dev, NULL,
		       netif_receive_skb);
}
{C}
再次进入 netif_receive_skb 时,由于 skb->dev 被设置成了 bridge 设备,而 bridge 设备的 rx_handler 函数是没有被设置的,所以就不会再次进入 bridge 逻辑,而是直接进入了主机上层协议栈。
下面看转发逻辑。转发逻辑主要在 br_forward 函数中,而 br_forward 主要调用 __br_forward 函数。
__br_forward
1
2
3
4
5
6
7
8
9
10
11
12
13
14
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
struct net_device *indev;
//vlan處理
skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
if (!skb)
return;
indev = skb->dev;
skb->dev = to->dev; //skb->dev設(shè)置為出口設(shè)備dev
skb_forward_csum(skb);
//經(jīng)過(guò)NF_BR_FORWARD hook點(diǎn),調(diào)用br_forward_finish
NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
br_forward_finish);
}
{C}
br_forward_finish
1
2
3
4
5
/*
 * br_forward_finish - continuation of the NF_BR_FORWARD hook (see
 * __br_forward). Returns the NF_HOOK() result for NF_BR_POST_ROUTING.
 */
int br_forward_finish(struct sk_buff *skb)
{
	/* Traverse the NF_BR_POST_ROUTING hook (output device skb->dev), then call br_dev_queue_push_xmit. */
	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, br_dev_queue_push_xmit);
}
{C}
br_dev_queue_push_xmit
1
2
3
4
5
6
7
8
9
10
11
12
/*
 * br_dev_queue_push_xmit - continuation of the NF_BR_POST_ROUTING hook.
 *
 * Frees the skb when nf_bridge_maybe_copy_header() returns non-zero or when
 * a non-GSO packet is longer than the device MTU; otherwise restores the
 * Ethernet header and queues the skb for transmission. Always returns 0.
 */
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
	/* ip_fragment doesn't copy the MAC header */
	if (nf_bridge_maybe_copy_header(skb) ||
	    (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))) {
		kfree_skb(skb);
		return 0;
	}

	skb_push(skb, ETH_HLEN);
	br_drop_fake_rtable(skb);
	/* Hand the frame to the link layer. */
	dev_queue_xmit(skb);

	return 0;
}
skb 进入 dev_queue_xmit 后就会调用相应设备驱动的发送函数,也就出了 bridge 逻辑。所以整个 3.10 kernel 的 bridge 转发逻辑如下图所示:
注意,和 2.6 kernel 一样,bridge 的 OUTPUT hook 点在 bridge dev 的发送函数中,这里不再分析列出。
?
評(píng)論
查看更多