歡迎您光臨本站 註冊首頁

Linux內核bridge中的數據包處理流程

←手機掃碼閱讀     火星人 @ 2014-03-26 , reply:0

1. 前言

本文簡要介紹數據包在進入橋網卡後在Linux網路協議棧的處理流程,並描述netfilter的hook點的掛接處理情況,具體各部分的詳細處理待後續文章中說明。

以下內核代碼版本為2.6.19.2.

2. 函數處理流程
bridge入口點handle_bridge()
/* net/core/dev.c */

/*
 * Excerpt of netif_receive_skb(); the "......" lines stand for code the
 * article leaves out. Only the bridge entry is shown: when handle_bridge()
 * returns non-zero, control jumps to the out label (not shown here) and the
 * elided code that follows the call is skipped.
 */
int netif_receive_skb(struct sk_buff *skb)
{
......
if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
goto out;
......
}

bridge基本掛接點處理函數:br_handle_frame_hook()

/*
 * Hand a received frame to the bridge hook when its device is a bridge port.
 *
 * Returns 0 without touching the frame when the packet type is
 * PACKET_LOOPBACK or the receiving device has no br_port. Otherwise any
 * pending *pt_prev handler is run through deliver_skb() (its result stored
 * in *ret) and cleared, and the result of br_handle_frame_hook() is returned.
 */
static __inline__ int handle_bridge(struct sk_buff **pskb,
				    struct packet_type **pt_prev, int *ret,
				    struct net_device *orig_dev)
{
	struct net_bridge_port *bport;

	/* Loopback packets never go through the bridge. */
	if ((*pskb)->pkt_type == PACKET_LOOPBACK)
		return 0;

	/* The device is not attached to any bridge. */
	bport = rcu_dereference((*pskb)->dev->br_port);
	if (bport == NULL)
		return 0;

	/* Flush the pending packet handler before the bridge sees the frame. */
	if (*pt_prev != NULL) {
		*ret = deliver_skb(*pskb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(bport, pskb);
}

br_handle_frame_hook()的實際實現:
/* net/bridge/br.c */
/*
 * Excerpt of the bridge module init function; "......" marks code the
 * article leaves out. The line shown points br_handle_frame_hook at
 * br_handle_frame.
 */
static int __init br_init(void)
{
......
br_handle_frame_hook = br_handle_frame;
......
}

br_handle_frame: PF_BRIDGE的prerouting點
/* net/bridge/br_input.c */
/*
 * Bridge receive handler for a frame that arrived on bridge port p.
 *
 * Return value:
 *   - link-local destination: 1 if the NF_BR_LOCAL_IN NF_HOOK result is
 *     non-zero, else 0;
 *   - 0 when br_should_route_hook is set and returns non-zero;
 *   - 1 after running the NF_BR_PRE_ROUTING hook, and 1 on the drop path
 *     (invalid source MAC, or port neither FORWARDING nor LEARNING), where
 *     the skb is freed.
 */
int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)->h_dest;
// Frames whose source MAC fails is_valid_ether_addr() are dropped.
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto err;
if (unlikely(is_link_local(dest))) {
// Frames for the host itself enter the PF_BRIDGE INPUT (NF_BR_LOCAL_IN) hook; usually only a few frames take this path.
skb->pkt_type = PACKET_HOST;
// The original annotation says this normally yields 1, meaning the bridge module has taken full charge of the frame.
// NOTE(review): the actual value depends on what br_handle_local_finish returns, which is not shown here - confirm.
return NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
NULL, br_handle_local_finish) != 0;
}
if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) {
// Per the original annotation, br_should_route_hook is usually not set.
if (br_should_route_hook) {
if (br_should_route_hook(pskb))
return 0;
// The hook received pskb, so reload skb and dest from it.
skb = *pskb;
dest = eth_hdr(skb)->h_dest;
}
// Destination MAC equals the bridge device's own address: mark the frame as addressed to this host.
if (!compare_ether_addr(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
// Once the PF_BRIDGE PRE_ROUTING hook completes, processing continues in br_handle_frame_finish.
NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
br_handle_frame_finish);
// Always return 1 afterwards: no other protocol family processes the packet; the bridge has handled it completely.
return 1;
}
err:
kfree_skb(skb);
// Always return 1 afterwards: no other protocol family processes the packet; the bridge has handled it completely.
return 1;
}
通過br_handle_frame_finish進入bridge的轉發:
/* note: already called with rcu_read_lock (preempt_disabled) */
/*
 * Continuation run after the PF_BRIDGE PRE_ROUTING hook: updates the
 * forwarding database with the frame's source MAC, then passes the frame up
 * to the bridge device, forwards it to a single port, or floods it.
 * Always returns 0; the skb is freed on the drop path.
 */
int br_handle_frame_finish(struct sk_buff *skb)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
struct net_bridge *br;
struct net_bridge_fdb_entry *dst;
// Set to 1 once a clone of the frame has been passed up to the bridge device.
int passedup = 0;
if (!p || p->state == BR_STATE_DISABLED)
goto drop;
/* insert into forwarding database after filtering to avoid spoofing */
br = p->br;
br_fdb_update(br, p, eth_hdr(skb)->h_source);
// A port in LEARNING state records the source address above but does not forward.
if (p->state == BR_STATE_LEARNING)
goto drop;
// Bridge device in promiscuous mode: pass a clone up; the original skb continues below.
if (br->dev->flags & IFF_PROMISC) {
struct sk_buff *skb2;
skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2 != NULL) {
passedup = 1;
br_pass_frame_up(br, skb2);
}
}
if (is_multicast_ether_addr(dest)) {
// Multicast forwarding; it uses the same flooding path as broadcast.
br->statistics.multicast++;
// clone = !passedup: if no clone was passed up yet, the flood works on a clone so skb can still be passed up below.
br_flood_forward(br, skb, !passedup);
if (!passedup)
br_pass_frame_up(br, skb);
goto out;
}
// Look up the egress entry by destination MAC.
dst = __br_fdb_get(br, dest);
if (dst != NULL && dst->is_local) {
// Local entry: pass the frame up unless a clone already went up, in which case free it.
if (!passedup)
br_pass_frame_up(br, skb);
else
kfree_skb(skb);
goto out;
}
if (dst != NULL) {
// Unicast forwarding
br_forward(dst->dst, skb);
goto out;
}
// No forwarding-database entry matched: flood the frame (broadcast forwarding).
br_flood_forward(br, skb, 0);
out:
return 0;
drop:
kfree_skb(skb);
goto out;
}
廣播/多播轉發: br_flood_forward/br_flood
/* called under bridge lock */
/*
 * Flood skb through br_flood(), using __br_forward as the per-port hook.
 * The clone flag is passed through unchanged.
 */
void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, int clone)
{
br_flood(br, skb, clone, __br_forward);
}
/* called under bridge lock */
/*
 * Send skb to every port in br->port_list for which should_deliver() is true,
 * calling __packet_hook once per port.
 *
 * When clone is non-zero the flood works on a clone and the caller's skb is
 * left alone. Delivery is deferred by one iteration (prev) so that the last
 * eligible port receives the skb itself and earlier ports receive clones.
 * If no port is eligible the skb is freed.
 */
static void br_flood(struct net_bridge *br, struct sk_buff *skb, int clone,
void (*__packet_hook)(const struct net_bridge_port *p,
struct sk_buff *skb))
{
struct net_bridge_port *p;
struct net_bridge_port *prev;
if (clone) {
struct sk_buff *skb2;
// Clone failed: count a drop and return; the caller's skb is not freed here.
if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
br->statistics.tx_dropped++;
return;
}
skb = skb2;
}
prev = NULL;
list_for_each_entry_rcu(p, &br->port_list, list) {
if (should_deliver(p, skb)) {
if (prev != NULL) {
struct sk_buff *skb2;
// Clone failed mid-flood: count a drop, free the working skb and stop.
if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
br->statistics.tx_dropped++;
kfree_skb(skb);
return;
}
// When reached from br_flood_forward() this hook is __br_forward.
__packet_hook(prev, skb2);
}
prev = p;
}
}
if (prev != NULL) {
// When reached from br_flood_forward() this hook is __br_forward.
__packet_hook(prev, skb);
return;
}
kfree_skb(skb);
}

單播轉發: br_forward
/* net/bridge/br_forward.c */
/* called with rcu_read_lock */
/*
 * Unicast forwarding: send skb out through port "to" via __br_forward() when
 * should_deliver() allows it; otherwise free the skb.
 */
void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
	/* Not deliverable on this port: drop the frame. */
	if (!should_deliver(to, skb)) {
		kfree_skb(skb);
		return;
	}

	/* Same low-level path that the flooding code uses. */
	__br_forward(to, skb);
}
FORWARD點:
/*
 * Retarget skb at the egress port's device and run the PF_BRIDGE FORWARD
 * hook, passing the original receiving device as the input device.
 */
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
struct net_device *indev;
// Remember the receiving device before skb->dev is overwritten with the egress device.
indev = skb->dev;
skb->dev = to->dev;
skb->ip_summed = CHECKSUM_NONE;
// Enter the PF_BRIDGE FORWARD hook; when it completes, processing continues in br_forward_finish().
NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
br_forward_finish);
}

POSTROUTING點:
// After the FORWARD hook, processing goes straight on to the POSTROUTING hook.
// Returns the result of the NF_BR_POST_ROUTING NF_HOOK call.
int br_forward_finish(struct sk_buff *skb)
{
// Enter the PF_BRIDGE POSTROUTING hook; when it completes, processing continues in br_dev_queue_push_xmit().
return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
}
數據包發出:
/*
 * Last step of the bridge output path: queue skb on skb->dev for transmit.
 *
 * The skb is freed instead when it is longer than the device MTU and is not
 * GSO, or when nf_bridge_maybe_copy_header() returns non-zero.
 * Always returns 0.
 */
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
	/* drop mtu oversized packets except gso */
	int oversized = packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb);

	/*
	 * ip_refrag calls ip_fragment, doesn't copy the MAC header.
	 * The header copy is only attempted for frames that passed the size
	 * check (short-circuit evaluation).
	 */
	if (oversized || nf_bridge_maybe_copy_header(skb)) {
		kfree_skb(skb);
		return 0;
	}

	skb_push(skb, ETH_HLEN);
	/*
	 * Per the original annotation, this ends up calling the device's
	 * hard_start_xmit() function.
	 */
	dev_queue_xmit(skb);
	return 0;
}
橋網卡設備的hard_start_xmit()函數定義為:
/* net/bridge/br_device.c */
/*
 * Excerpt of the bridge device setup function; "......" marks code the
 * article leaves out. The line shown installs br_dev_xmit as the device's
 * hard_start_xmit handler.
 */
void br_dev_setup(struct net_device *dev)
{
......
dev->hard_start_xmit = br_dev_xmit;
......
}
/* net device transmit always called with no BH (preempt_disabled) */
/*
 * Transmit handler of the bridge device: updates the TX counters, records
 * the MAC header position, strips the Ethernet header and then either
 * delivers the frame to the port found for its destination MAC or floods it.
 * Always returns 0.
 */
int br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_bridge *bridge = netdev_priv(dev);
	/* Taken before skb_pull(): points at the destination MAC. */
	const unsigned char *dmac = skb->data;
	struct net_bridge_fdb_entry *fdb;

	bridge->statistics.tx_packets++;
	bridge->statistics.tx_bytes += skb->len;

	skb->mac.raw = skb->data;
	skb_pull(skb, ETH_HLEN);

	/* Group bit set in the first address byte: multicast send, flood. */
	if (dmac[0] & 1) {
		br_flood_deliver(bridge, skb, 0);
		return 0;
	}

	fdb = __br_fdb_get(bridge, dmac);
	if (fdb != NULL) {
		/* Unicast send to the port recorded for this address. */
		br_deliver(fdb->dst, skb);
	} else {
		/* No entry for this address: flood (broadcast send). */
		br_flood_deliver(bridge, skb, 0);
	}

	/*
	 * Per the original annotation, all of these send functions eventually
	 * call __br_deliver().
	 */
	return 0;
}

/* net/bridge/br_forward.c */
/*
 * Point skb at the egress port's device and run the PF_BRIDGE LOCAL_OUT
 * hook with no input device.
 */
static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
{
skb->dev = to->dev;
// This is the PF_BRIDGE OUTPUT (NF_BR_LOCAL_OUT) hook; when it completes, processing continues in br_forward_finish.
NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
br_forward_finish);
}

總結: PF_BRIDGE中的各個hook點和PF_INET不同, 可用下面的圖表示:

PREROUTING --+--FORWARD-----POSTROUTING------+----OUTPUT
             |                               |
             |                               |
           INPUT

3. PF_BRIDGE的hook點

在net/bridge/br_netfilter.c中定義了以下hook點,注意這些hook點主要是PF_BRIDGE協議族的。
/* net/bridge/br_netfilter.c */
/* For br_nf_local_out we need (prio = NF_BR_PRI_FIRST), to insure that innocent
 * PF_BRIDGE/NF_BR_LOCAL_OUT functions don't get bridged traffic as input.
 * For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
 * ip_refrag() can return NF_STOLEN. */
/* Hook table: six PF_BRIDGE entries followed by eight PF_INET/PF_INET6
 * entries that use ip_sabotage_in / ip_sabotage_out. */
static struct nf_hook_ops br_nf_ops[] = {
// PF_BRIDGE hook registrations
// PREROUTING hook
{ .hook = br_nf_pre_routing,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF, },
// INPUT hook
{ .hook = br_nf_local_in,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_BR_PRI_BRNF, },
// FORWARD hook (br_nf_forward_ip, priority NF_BR_PRI_BRNF - 1)
{ .hook = br_nf_forward_ip,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF - 1, },
// FORWARD hook (br_nf_forward_arp)
{ .hook = br_nf_forward_arp,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF, },
// OUTPUT hook
{ .hook = br_nf_local_out,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_FIRST, },
// POSTROUTING hook
{ .hook = br_nf_post_routing,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_LAST, },
// The remaining entries hook PF_INET/PF_INET6. Per the original annotation they do
// little packet processing: they short-circuit the host's own input/output packets so
// that those bypass bridge processing (the handlers themselves are not shown here).
{ .hook = ip_sabotage_in,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_IP_PRE_ROUTING,
.priority = NF_IP_PRI_FIRST, },
{ .hook = ip_sabotage_in,
.owner = THIS_MODULE,
.pf = PF_INET6,
.hooknum = NF_IP6_PRE_ROUTING,
.priority = NF_IP6_PRI_FIRST, },
{ .hook = ip_sabotage_out,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_IP_FORWARD,
.priority = NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD, },
{ .hook = ip_sabotage_out,
.owner = THIS_MODULE,
.pf = PF_INET6,
.hooknum = NF_IP6_FORWARD,
.priority = NF_IP6_PRI_BRIDGE_SABOTAGE_FORWARD, },
{ .hook = ip_sabotage_out,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_IP_LOCAL_OUT,
.priority = NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT, },
{ .hook = ip_sabotage_out,
.owner = THIS_MODULE,
.pf = PF_INET6,
.hooknum = NF_IP6_LOCAL_OUT,
.priority = NF_IP6_PRI_BRIDGE_SABOTAGE_LOCAL_OUT, },
{ .hook = ip_sabotage_out,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_IP_POST_ROUTING,
.priority = NF_IP_PRI_FIRST, },
{ .hook = ip_sabotage_out,
.owner = THIS_MODULE,
.pf = PF_INET6,
.hooknum = NF_IP6_POST_ROUTING,
.priority = NF_IP6_PRI_FIRST, },
};

// Handler for the PF_BRIDGE PREROUTING hook (excerpt; "......" marks code the article leaves out).
// The visible path runs the PF_INET PRE_ROUTING hook and returns NF_STOLEN;
// the inhdr_error/out labels return NF_DROP.
static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
......
// Continue by running the PF_INET PREROUTING hooks; afterwards processing goes on in br_nf_pre_routing_finish.
NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
br_nf_pre_routing_finish);
return NF_STOLEN;
inhdr_error:
// IP_INC_STATS_BH(IpInHdrErrors);
out:
return NF_DROP;
}

// Handler for the PF_BRIDGE FORWARD hook (excerpt; "......" marks code the article leaves out).
// The visible path runs the NF_IP_FORWARD hook of family pf and returns NF_STOLEN.
static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
......
// Continue by running the PF_INET/PF_INET6 FORWARD hooks, with bridge_parent(in) as the input device;
// afterwards processing goes on in br_nf_forward_finish.
NF_HOOK(pf, NF_IP_FORWARD, skb, bridge_parent(in), parent,
br_nf_forward_finish);
return NF_STOLEN;
}
// Handler for the PF_BRIDGE OUTPUT hook (excerpt; "......" marks code the article leaves out).
// Depending on whether realindev is set, it re-runs the NF_IP_FORWARD or the
// NF_IP_LOCAL_OUT hook of family pf above a priority threshold, continuing in
// br_nf_local_out_finish. The visible path returns NF_STOLEN.
static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb,
				    const struct net_device *in,
				    const struct net_device *out,
				    int (*okfn)(struct sk_buff *))
{
......
	/* IP forwarded traffic has a physindev, locally
	 * generated traffic hasn't. */
	if (realindev != NULL) {
		/* Use the bridge parent of realindev as the input device
		 * unless BRNF_DONT_TAKE_PARENT is set in the mask. */
		if (!(nf_bridge->mask & BRNF_DONT_TAKE_PARENT)) {
			struct net_device *parent = bridge_parent(realindev);
			if (parent)
				realindev = parent;
		}
		// Continue with the PF_INET/PF_INET6 FORWARD hooks, but only those whose
		// priority is NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1 or above.
		NF_HOOK_THRESH(pf, NF_IP_FORWARD, skb, realindev,
			       realoutdev, br_nf_local_out_finish,
			       NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1);
	} else {
		// Continue with the PF_INET/PF_INET6 LOCAL_OUT hooks, but only those whose
		// priority is NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1 or above.
		NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev,
			       realoutdev, br_nf_local_out_finish,
			       NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1);
	}
out:
	return NF_STOLEN;
}

// Handler for the PF_BRIDGE POSTROUTING hook (excerpt; "......" marks code the article leaves out).
// The visible path runs the NF_IP_POST_ROUTING hook of family pf and returns NF_STOLEN.
static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
......
// Continue by running the PF_INET/PF_INET6 POSTROUTING hooks; afterwards processing goes on in br_nf_dev_queue_xmit.
NF_HOOK(pf, NF_IP_POST_ROUTING, skb, NULL, realoutdev,
br_nf_dev_queue_xmit);
return NF_STOLEN;
#ifdef CONFIG_NETFILTER_DEBUG
// Debug-only path: print the device names and skb pointers, dump the stack, and accept the packet.
print_error:
if (skb->dev != NULL) {
printk("[%s]", skb->dev->name);
if (realoutdev)
printk("[%s]", realoutdev->name);
}
printk(" head:%p, raw:%p, data:%p\n", skb->head, skb->mac.raw,
skb->data);
dump_stack();
return NF_ACCEPT;
#endif
}

由此可見, PF_INET的各個hook點也被PF_BRIDGE的各個hook點調用,因此可以在橋網卡中進行過濾或NAT等操作。

4. 結論

BRIDGE的數據處理流程是一個獨立的處理過程, 如果處理正常的話就不再返回到其他協議處理。
在橋的處理層次也和IP協議一樣,可以掛接多個PF_BRIDGE的掛接點,這些掛接點中又調用了PF_INET族的掛接點,從而實現了橋下的過濾、NAT等功能。

[火星人 ] Linux內核bridge中的數據包處理流程已經有1259次圍觀

http://coctec.com/docs/linux/show-post-188856.html