499 lines
15 KiB
C
499 lines
15 KiB
C
/*
|
|
* eBPF kernel space program part
|
|
*
|
|
* Toy eBPF program for demonstration purposes, some parts derived from
|
|
* kernel tree's samples/bpf/sockex2_kern.c example.
|
|
*
|
|
* More background on eBPF, kernel tree: Documentation/networking/filter.txt
|
|
*
|
|
* Note, this file is rather large, and most classifier and actions are
|
|
* likely smaller to accomplish one specific use-case and are tailored
|
|
* for high performance. For performance reasons, you might also have the
|
|
* classifier and action already merged inside the classifier.
|
|
*
|
|
* In order to show various features it serves as a bigger programming
|
|
* example, which you should feel free to rip apart and experiment with.
|
|
*
|
|
* Compilation, configuration example:
|
|
*
|
|
* Note: as long as the BPF backend in LLVM is still experimental,
|
|
 * you need to build LLVM with --enable-experimental-targets=BPF
|
|
* Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
|
|
* and you have libelf.h and gelf.h headers and can link tc against -lelf.
|
|
*
|
|
* In case you need to sync kernel headers, go to your kernel source tree:
|
|
* # make headers_install INSTALL_HDR_PATH=/usr/
|
|
*
|
|
* $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
|
|
* $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
|
|
* $ objdump -h bpf.o
|
|
* [...]
|
|
* 3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3
|
|
* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
|
|
* 4 action-mark 00000088 0000000000000000 0000000000000000 00000838 2**3
|
|
* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
|
|
* 5 action-rand 00000098 0000000000000000 0000000000000000 000008c0 2**3
|
|
* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
|
|
* 6 maps 00000030 0000000000000000 0000000000000000 00000958 2**2
|
|
* CONTENTS, ALLOC, LOAD, DATA
|
|
* 7 license 00000004 0000000000000000 0000000000000000 00000988 2**0
|
|
* CONTENTS, ALLOC, LOAD, DATA
|
|
* [...]
|
|
* # echo 1 > /proc/sys/net/core/bpf_jit_enable
|
|
* $ gcc bpf_agent.c -o bpf_agent -Wall -O2
|
|
* # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal)
|
|
* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
|
|
* action bpf obj bpf.o sec action-mark \
|
|
* action bpf obj bpf.o sec action-rand ok
|
|
* # tc filter show dev em1
|
|
* filter parent 1: protocol all pref 49152 bpf
|
|
* filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
|
|
* action order 1: bpf bpf.o:[action-mark] default-action pipe
|
|
* index 52 ref 1 bind 1
|
|
*
|
|
* action order 2: bpf bpf.o:[action-rand] default-action pipe
|
|
* index 53 ref 1 bind 1
|
|
*
|
|
* action order 3: gact action pass
|
|
* random type none pass val 0
|
|
* index 38 ref 1 bind 1
|
|
*
|
|
* The same program can also be installed on ingress side (as opposed to above
|
|
* egress configuration), e.g.:
|
|
*
|
|
* # tc qdisc add dev em1 handle ffff: ingress
|
|
* # tc filter add dev em1 parent ffff: bpf obj ...
|
|
*
|
|
* Notes on BPF agent:
|
|
*
|
|
* In the above example, the bpf_agent creates the unix domain socket
|
|
 * natively. "tc exec" can also spawn a shell and hold the sockets there:
|
|
*
|
|
* # tc exec bpf imp /tmp/bpf-uds
|
|
* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
|
|
* action bpf obj bpf.o sec action-mark \
|
|
* action bpf obj bpf.o sec action-rand ok
|
|
* sh-4.2# (shell spawned from tc exec)
|
|
* sh-4.2# bpf_agent
|
|
* [...]
|
|
*
|
|
* This will read out fds over environment and produce the same data dump
|
|
* as below. This has the advantage that the spawned shell owns the fds
|
|
* and thus if the agent is restarted, it can reattach to the same fds, also
|
|
* various programs can easily read/modify the data simultaneously from user
|
|
* space side.
|
|
*
|
|
* If the shell is unnecessary, the agent can also just be spawned directly
|
|
* via tc exec:
|
|
*
|
|
* # tc exec bpf imp /tmp/bpf-uds run bpf_agent
|
|
* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
|
|
* action bpf obj bpf.o sec action-mark \
|
|
* action bpf obj bpf.o sec action-rand ok
|
|
*
|
|
* BPF agent example output:
|
|
*
|
|
* ver: 1
|
|
* obj: bpf.o
|
|
* dev: 64770
|
|
* ino: 6045133
|
|
* maps: 3
|
|
* map0:
|
|
* `- fd: 4
|
|
* | serial: 1
|
|
* | type: 1
|
|
* | max elem: 256
|
|
* | size key: 1
|
|
* ` size val: 16
|
|
* map1:
|
|
* `- fd: 5
|
|
* | serial: 2
|
|
* | type: 1
|
|
* | max elem: 1024
|
|
* | size key: 4
|
|
* ` size val: 16
|
|
* map2:
|
|
* `- fd: 6
|
|
* | serial: 3
|
|
* | type: 2
|
|
* | max elem: 64
|
|
* | size key: 4
|
|
* ` size val: 8
|
|
* data, period: 5sec
|
|
* `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0
|
|
* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0]
|
|
* ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0]
|
|
* data, period: 5sec
|
|
* `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1
|
|
* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0]
|
|
* ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0]
|
|
* data, period: 5sec
|
|
* `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3
|
|
* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0]
|
|
* ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0]
|
|
* [...]
|
|
*
|
|
* This now means, the below classifier and action pipeline has been loaded
|
|
* as eBPF bytecode into the kernel, the kernel has verified that the
|
|
* execution of the bytecode is "safe", and it has JITed the programs
|
|
* afterwards, so that upon invocation they're running on native speed. tc
|
|
* has transferred all map file descriptors to the bpf_agent via IPC and
|
|
* even after tc exits, the agent can read out or modify all map data.
|
|
*
|
|
* Note that the export to the uds is done only once in the classifier and
|
|
* not in the action. It's enough to export the (here) shared descriptors
|
|
* once.
|
|
*
|
|
* If you need to disassemble the generated JIT image (echo with 2), the
|
|
* kernel tree has under tools/net/ a small helper, you can invoke e.g.
|
|
* `bpf_jit_disasm -o`.
|
|
*
|
|
* Please find in the code below further comments.
|
|
*
|
|
* -- Happy eBPF hacking! ;)
|
|
*/
|
|
#include <stdint.h>
|
|
#include <stdbool.h>
|
|
#include <sys/types.h>
|
|
#include <sys/socket.h>
|
|
#include <asm/types.h>
|
|
#include <linux/in.h>
|
|
#include <linux/if.h>
|
|
#include <linux/if_ether.h>
|
|
#include <linux/ip.h>
|
|
#include <linux/ipv6.h>
|
|
#include <linux/if_tunnel.h>
|
|
#include <linux/filter.h>
|
|
#include <linux/bpf.h>
|
|
|
|
/* Common, shared definitions with ebpf_agent.c. */
|
|
#include "bpf_shared.h"
|
|
/* BPF helper functions for our example. */
|
|
#include "../../include/bpf_api.h"
|
|
|
|
/* Could be defined here as well, or included from the header. */
/* tc action opcodes, as understood by cls_bpf/act_bpf. */
#define TC_ACT_UNSPEC (-1)	/* Use the verdict configured in tc. */
#define TC_ACT_OK 0		/* Accept the packet. */
#define TC_ACT_RECLASSIFY 1
#define TC_ACT_SHOT 2		/* Drop the packet. */
#define TC_ACT_PIPE 3
#define TC_ACT_STOLEN 4
#define TC_ACT_QUEUED 5
#define TC_ACT_REPEAT 6

/* Other, misc stuff. */
#define IP_MF 0x2000		/* IPv4 frag_off: "more fragments" flag. */
#define IP_OFFSET 0x1FFF	/* IPv4 frag_off: fragment offset mask. */
|
|
|
|
/* eBPF map definitions, all placed in section "maps". */
|
|
/* Protocol statistics: hash map keyed by the 8 bit IP protocol number,
 * each value holding packet/byte counters (struct count_tuple, shared
 * with the agent via bpf_shared.h). Updated by cls_update_proto_map()
 * and read out from user space over the exported map fds.
 */
struct bpf_elf_map __section("maps") map_proto = {
	.type = BPF_MAP_TYPE_HASH,
	.id = BPF_MAP_ID_PROTO,
	.size_key = sizeof(uint8_t),
	.size_value = sizeof(struct count_tuple),
	.max_elem = 256,
};
|
|
|
|
/* Queue statistics: hash map keyed by skb->queue_mapping; the value
 * counts total packets plus queue/cpu mismatches (struct count_queue).
 * Updated by cls_update_queue_map().
 */
struct bpf_elf_map __section("maps") map_queue = {
	.type = BPF_MAP_TYPE_HASH,
	.id = BPF_MAP_ID_QUEUE,
	.size_key = sizeof(uint32_t),
	.size_value = sizeof(struct count_queue),
	.max_elem = 1024,
};
|
|
|
|
/* Drop statistics: array map indexed by cpu id, one counter per slot.
 * Note the value size is sizeof(long); in-program accesses should use
 * a matching width. Bumped from act_update_drop_map().
 */
struct bpf_elf_map __section("maps") map_drops = {
	.type = BPF_MAP_TYPE_ARRAY,
	.id = BPF_MAP_ID_DROPS,
	.size_key = sizeof(uint32_t),
	.size_value = sizeof(long),
	.max_elem = 64,
};
|
|
|
|
/* Helper functions and definitions for the flow dissector used by the
|
|
* example classifier. This resembles the kernel's flow dissector to
|
|
 * some extent and is just used as an example to show what's possible
|
|
* with eBPF.
|
|
*/
|
|
/* Forward declaration only; presumably to satisfy one of the included
 * uapi headers -- TODO confirm it is still needed.
 */
struct sockaddr;

/* 802.1Q/802.1ad VLAN tag as found on the wire after the Ethernet
 * header: TCI followed by the encapsulated ethertype.
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};
|
|
|
|
/* Dissected flow tuple, filled in by flow_dissector(). */
struct flow_keys {
	__u32 src;		/* IPv4 saddr, or XOR-folded IPv6 saddr */
	__u32 dst;		/* IPv4 daddr, or XOR-folded IPv6 daddr */
	union {
		__u32 ports;	/* both L4 ports in one 32 bit word */
		__u16 port16[2];
	};
	__s32 th_off;		/* offset of the transport header in the skb */
	__u8 ip_proto;		/* transport protocol; 0 for IPv4 fragments */
};
|
|
|
|
/* Return the extra offset past the transport header start at which the
 * port words live for the given protocol. AH is the only special case
 * (SPI comes first, ports follow 4 bytes in); TCP, UDP, DCCP, ESP,
 * SCTP, UDPLITE and everything unknown keep offset 0.
 */
static inline int flow_ports_offset(__u8 ip_proto)
{
	return ip_proto == IPPROTO_AH ? 4 : 0;
}
|
|
|
|
static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off)
|
|
{
|
|
return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
|
|
(IP_MF | IP_OFFSET));
|
|
}
|
|
|
|
static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
|
|
__u8 *ip_proto, struct flow_keys *flow)
|
|
{
|
|
__u8 ip_ver_len;
|
|
|
|
if (unlikely(flow_is_frag(skb, nh_off)))
|
|
*ip_proto = 0;
|
|
else
|
|
*ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr,
|
|
protocol));
|
|
if (*ip_proto != IPPROTO_GRE) {
|
|
flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
|
|
flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
|
|
}
|
|
|
|
ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
|
|
if (likely(ip_ver_len == 0x45))
|
|
nh_off += 20;
|
|
else
|
|
nh_off += (ip_ver_len & 0xF) << 2;
|
|
|
|
return nh_off;
|
|
}
|
|
|
|
/* Fold the 128 bit IPv6 address at @off into 32 bits by XORing its
 * four words. NOTE(review): kept unrolled, presumably to stay friendly
 * to the eBPF verifier -- confirm before converting to a loop.
 */
static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
{
	__u32 hash;

	hash  = load_word(skb, off);
	hash ^= load_word(skb, off + 4);
	hash ^= load_word(skb, off + 8);
	hash ^= load_word(skb, off + 12);

	return hash;
}
|
|
|
|
static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
|
|
__u8 *ip_proto, struct flow_keys *flow)
|
|
{
|
|
*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));
|
|
|
|
flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr));
|
|
flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr));
|
|
|
|
return nh_off + sizeof(struct ipv6hdr);
|
|
}
|
|
|
|
static inline bool flow_dissector(struct __sk_buff *skb,
|
|
struct flow_keys *flow)
|
|
{
|
|
int poff, nh_off = BPF_LL_OFF + ETH_HLEN;
|
|
__be16 proto = skb->protocol;
|
|
__u8 ip_proto;
|
|
|
|
/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
|
|
if (proto == htons(ETH_P_8021AD)) {
|
|
proto = load_half(skb, nh_off +
|
|
offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
|
|
nh_off += sizeof(struct vlan_hdr);
|
|
}
|
|
if (proto == htons(ETH_P_8021Q)) {
|
|
proto = load_half(skb, nh_off +
|
|
offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
|
|
nh_off += sizeof(struct vlan_hdr);
|
|
}
|
|
|
|
if (likely(proto == htons(ETH_P_IP)))
|
|
nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
|
|
else if (proto == htons(ETH_P_IPV6))
|
|
nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
|
|
else
|
|
return false;
|
|
|
|
switch (ip_proto) {
|
|
case IPPROTO_GRE: {
|
|
struct gre_hdr {
|
|
__be16 flags;
|
|
__be16 proto;
|
|
};
|
|
|
|
__u16 gre_flags = load_half(skb, nh_off +
|
|
offsetof(struct gre_hdr, flags));
|
|
__u16 gre_proto = load_half(skb, nh_off +
|
|
offsetof(struct gre_hdr, proto));
|
|
|
|
if (gre_flags & (GRE_VERSION | GRE_ROUTING))
|
|
break;
|
|
|
|
nh_off += 4;
|
|
if (gre_flags & GRE_CSUM)
|
|
nh_off += 4;
|
|
if (gre_flags & GRE_KEY)
|
|
nh_off += 4;
|
|
if (gre_flags & GRE_SEQ)
|
|
nh_off += 4;
|
|
|
|
if (gre_proto == ETH_P_8021Q) {
|
|
gre_proto = load_half(skb, nh_off +
|
|
offsetof(struct vlan_hdr,
|
|
h_vlan_encapsulated_proto));
|
|
nh_off += sizeof(struct vlan_hdr);
|
|
}
|
|
if (gre_proto == ETH_P_IP)
|
|
nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
|
|
else if (gre_proto == ETH_P_IPV6)
|
|
nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
|
|
else
|
|
return false;
|
|
break;
|
|
}
|
|
case IPPROTO_IPIP:
|
|
nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
|
|
break;
|
|
case IPPROTO_IPV6:
|
|
nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
|
|
default:
|
|
break;
|
|
}
|
|
|
|
nh_off += flow_ports_offset(ip_proto);
|
|
|
|
flow->ports = load_word(skb, nh_off);
|
|
flow->th_off = nh_off;
|
|
flow->ip_proto = ip_proto;
|
|
|
|
return true;
|
|
}
|
|
|
|
static inline void cls_update_proto_map(const struct __sk_buff *skb,
|
|
const struct flow_keys *flow)
|
|
{
|
|
uint8_t proto = flow->ip_proto;
|
|
struct count_tuple *ct, _ct;
|
|
|
|
ct = map_lookup_elem(&map_proto, &proto);
|
|
if (likely(ct)) {
|
|
lock_xadd(&ct->packets, 1);
|
|
lock_xadd(&ct->bytes, skb->len);
|
|
return;
|
|
}
|
|
|
|
/* No hit yet, we need to create a new entry. */
|
|
_ct.packets = 1;
|
|
_ct.bytes = skb->len;
|
|
|
|
map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
|
|
}
|
|
|
|
static inline void cls_update_queue_map(const struct __sk_buff *skb)
|
|
{
|
|
uint32_t queue = skb->queue_mapping;
|
|
struct count_queue *cq, _cq;
|
|
bool mismatch;
|
|
|
|
mismatch = skb->queue_mapping != get_smp_processor_id();
|
|
|
|
cq = map_lookup_elem(&map_queue, &queue);
|
|
if (likely(cq)) {
|
|
lock_xadd(&cq->total, 1);
|
|
if (mismatch)
|
|
lock_xadd(&cq->mismatch, 1);
|
|
return;
|
|
}
|
|
|
|
/* No hit yet, we need to create a new entry. */
|
|
_cq.total = 1;
|
|
_cq.mismatch = mismatch ? 1 : 0;
|
|
|
|
map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
|
|
}
|
|
|
|
/* eBPF program definitions, placed in various sections, which can
|
|
* have custom section names. If custom names are in use, it's
|
|
* required to point tc to the correct section, e.g.
|
|
*
|
|
* tc filter add [...] bpf obj cls.o sec cls-tos [...]
|
|
*
|
|
* in case the program resides in __section("cls-tos").
|
|
*
|
|
* Default section for cls_bpf is: "classifier", for act_bpf is:
|
|
* "action". Naturally, if for example multiple actions are present
|
|
* in the same file, they need to have distinct section names.
|
|
*
|
|
* It is however not required to have multiple programs sharing
|
|
* a file.
|
|
*/
|
|
__section("classifier")
|
|
int cls_main(struct __sk_buff *skb)
|
|
{
|
|
struct flow_keys flow;
|
|
|
|
if (!flow_dissector(skb, &flow))
|
|
return 0; /* No match in cls_bpf. */
|
|
|
|
cls_update_proto_map(skb, &flow);
|
|
cls_update_queue_map(skb);
|
|
|
|
return flow.ip_proto;
|
|
}
|
|
|
|
static inline void act_update_drop_map(void)
|
|
{
|
|
uint32_t *count, cpu = get_smp_processor_id();
|
|
|
|
count = map_lookup_elem(&map_drops, &cpu);
|
|
if (count)
|
|
/* Only this cpu is accessing this element. */
|
|
(*count)++;
|
|
}
|
|
|
|
__section("action-mark")
|
|
int act_mark_main(struct __sk_buff *skb)
|
|
{
|
|
/* You could also mangle skb data here with the helper function
|
|
* BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
|
|
* do that already in the classifier itself as a merged combination
|
|
* of classifier'n'action model.
|
|
*/
|
|
|
|
if (skb->mark == 0xcafe) {
|
|
act_update_drop_map();
|
|
return TC_ACT_SHOT;
|
|
}
|
|
|
|
/* Default configured tc opcode. */
|
|
return TC_ACT_UNSPEC;
|
|
}
|
|
|
|
__section("action-rand")
|
|
int act_rand_main(struct __sk_buff *skb)
|
|
{
|
|
/* Sorry, we're near event horizon ... */
|
|
if ((get_prandom_u32() & 3) == 0) {
|
|
act_update_drop_map();
|
|
return TC_ACT_SHOT;
|
|
}
|
|
|
|
return TC_ACT_UNSPEC;
|
|
}
|
|
|
|
/* Last but not least, the file contains a license. Some future helper
|
|
* functions may only be available with a GPL license.
|
|
*/
|
|
BPF_LICENSE("GPL");
|