diff options
Diffstat (limited to 'examples/bpf/bpf_prog.c')
-rw-r--r-- | examples/bpf/bpf_prog.c | 499 |
1 files changed, 0 insertions, 499 deletions
diff --git a/examples/bpf/bpf_prog.c b/examples/bpf/bpf_prog.c deleted file mode 100644 index 47280492..00000000 --- a/examples/bpf/bpf_prog.c +++ /dev/null @@ -1,499 +0,0 @@ -/* - * eBPF kernel space program part - * - * Toy eBPF program for demonstration purposes, some parts derived from - * kernel tree's samples/bpf/sockex2_kern.c example. - * - * More background on eBPF, kernel tree: Documentation/networking/filter.txt - * - * Note, this file is rather large, and most classifier and actions are - * likely smaller to accomplish one specific use-case and are tailored - * for high performance. For performance reasons, you might also have the - * classifier and action already merged inside the classifier. - * - * In order to show various features it serves as a bigger programming - * example, which you should feel free to rip apart and experiment with. - * - * Compilation, configuration example: - * - * Note: as long as the BPF backend in LLVM is still experimental, - * you need to build LLVM with --enable-experimental-targets=BPF - * Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y, - * and you have libelf.h and gelf.h headers and can link tc against -lelf. - * - * In case you need to sync kernel headers, go to your kernel source tree: - * # make headers_install INSTALL_HDR_PATH=/usr/ - * - * $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH - * $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o - * $ objdump -h bpf.o - * [...] 
- * 3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3 - * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE - * 4 action-mark 00000088 0000000000000000 0000000000000000 00000838 2**3 - * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE - * 5 action-rand 00000098 0000000000000000 0000000000000000 000008c0 2**3 - * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE - * 6 maps 00000030 0000000000000000 0000000000000000 00000958 2**2 - * CONTENTS, ALLOC, LOAD, DATA - * 7 license 00000004 0000000000000000 0000000000000000 00000988 2**0 - * CONTENTS, ALLOC, LOAD, DATA - * [...] - * # echo 1 > /proc/sys/net/core/bpf_jit_enable - * $ gcc bpf_agent.c -o bpf_agent -Wall -O2 - * # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal) - * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ - * action bpf obj bpf.o sec action-mark \ - * action bpf obj bpf.o sec action-rand ok - * # tc filter show dev em1 - * filter parent 1: protocol all pref 49152 bpf - * filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier] - * action order 1: bpf bpf.o:[action-mark] default-action pipe - * index 52 ref 1 bind 1 - * - * action order 2: bpf bpf.o:[action-rand] default-action pipe - * index 53 ref 1 bind 1 - * - * action order 3: gact action pass - * random type none pass val 0 - * index 38 ref 1 bind 1 - * - * The same program can also be installed on ingress side (as opposed to above - * egress configuration), e.g.: - * - * # tc qdisc add dev em1 handle ffff: ingress - * # tc filter add dev em1 parent ffff: bpf obj ... - * - * Notes on BPF agent: - * - * In the above example, the bpf_agent creates the unix domain socket - * natively. 
"tc exec" can also spawn a shell and hold the socktes there: - * - * # tc exec bpf imp /tmp/bpf-uds - * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ - * action bpf obj bpf.o sec action-mark \ - * action bpf obj bpf.o sec action-rand ok - * sh-4.2# (shell spawned from tc exec) - * sh-4.2# bpf_agent - * [...] - * - * This will read out fds over environment and produce the same data dump - * as below. This has the advantage that the spawned shell owns the fds - * and thus if the agent is restarted, it can reattach to the same fds, also - * various programs can easily read/modify the data simultaneously from user - * space side. - * - * If the shell is unnecessary, the agent can also just be spawned directly - * via tc exec: - * - * # tc exec bpf imp /tmp/bpf-uds run bpf_agent - * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ - * action bpf obj bpf.o sec action-mark \ - * action bpf obj bpf.o sec action-rand ok - * - * BPF agent example output: - * - * ver: 1 - * obj: bpf.o - * dev: 64770 - * ino: 6045133 - * maps: 3 - * map0: - * `- fd: 4 - * | serial: 1 - * | type: 1 - * | max elem: 256 - * | size key: 1 - * ` size val: 16 - * map1: - * `- fd: 5 - * | serial: 2 - * | type: 1 - * | max elem: 1024 - * | size key: 4 - * ` size val: 16 - * map2: - * `- fd: 6 - * | serial: 3 - * | type: 2 - * | max elem: 64 - * | size key: 4 - * ` size val: 8 - * data, period: 5sec - * `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0 - * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0] - * ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0] - * data, period: 5sec - * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1 - * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0] - * ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0] - * data, period: 5sec - * `- 
number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3 - * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0] - * ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0] - * [...] - * - * This now means, the below classifier and action pipeline has been loaded - * as eBPF bytecode into the kernel, the kernel has verified that the - * execution of the bytecode is "safe", and it has JITed the programs - * afterwards, so that upon invocation they're running on native speed. tc - * has transferred all map file descriptors to the bpf_agent via IPC and - * even after tc exits, the agent can read out or modify all map data. - * - * Note that the export to the uds is done only once in the classifier and - * not in the action. It's enough to export the (here) shared descriptors - * once. - * - * If you need to disassemble the generated JIT image (echo with 2), the - * kernel tree has under tools/net/ a small helper, you can invoke e.g. - * `bpf_jit_disasm -o`. - * - * Please find in the code below further comments. - * - * -- Happy eBPF hacking! ;) - */ -#include <stdint.h> -#include <stdbool.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <asm/types.h> -#include <linux/in.h> -#include <linux/if.h> -#include <linux/if_ether.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/if_tunnel.h> -#include <linux/filter.h> -#include <linux/bpf.h> - -/* Common, shared definitions with ebpf_agent.c. */ -#include "bpf_shared.h" -/* BPF helper functions for our example. */ -#include "../../include/bpf_api.h" - -/* Could be defined here as well, or included from the header. */ -#define TC_ACT_UNSPEC (-1) -#define TC_ACT_OK 0 -#define TC_ACT_RECLASSIFY 1 -#define TC_ACT_SHOT 2 -#define TC_ACT_PIPE 3 -#define TC_ACT_STOLEN 4 -#define TC_ACT_QUEUED 5 -#define TC_ACT_REPEAT 6 - -/* Other, misc stuff. 
*/ -#define IP_MF 0x2000 -#define IP_OFFSET 0x1FFF - -/* eBPF map definitions, all placed in section "maps". */ -struct bpf_elf_map __section("maps") map_proto = { - .type = BPF_MAP_TYPE_HASH, - .id = BPF_MAP_ID_PROTO, - .size_key = sizeof(uint8_t), - .size_value = sizeof(struct count_tuple), - .max_elem = 256, -}; - -struct bpf_elf_map __section("maps") map_queue = { - .type = BPF_MAP_TYPE_HASH, - .id = BPF_MAP_ID_QUEUE, - .size_key = sizeof(uint32_t), - .size_value = sizeof(struct count_queue), - .max_elem = 1024, -}; - -struct bpf_elf_map __section("maps") map_drops = { - .type = BPF_MAP_TYPE_ARRAY, - .id = BPF_MAP_ID_DROPS, - .size_key = sizeof(uint32_t), - .size_value = sizeof(long), - .max_elem = 64, -}; - -/* Helper functions and definitions for the flow dissector used by the - * example classifier. This resembles the kernel's flow dissector to - * some extend and is just used as an example to show what's possible - * with eBPF. - */ -struct sockaddr; - -struct vlan_hdr { - __be16 h_vlan_TCI; - __be16 h_vlan_encapsulated_proto; -}; - -struct flow_keys { - __u32 src; - __u32 dst; - union { - __u32 ports; - __u16 port16[2]; - }; - __s32 th_off; - __u8 ip_proto; -}; - -static inline int flow_ports_offset(__u8 ip_proto) -{ - switch (ip_proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - default: - return 0; - case IPPROTO_AH: - return 4; - } -} - -static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off) -{ - return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) & - (IP_MF | IP_OFFSET)); -} - -static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off, - __u8 *ip_proto, struct flow_keys *flow) -{ - __u8 ip_ver_len; - - if (unlikely(flow_is_frag(skb, nh_off))) - *ip_proto = 0; - else - *ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr, - protocol)); - if (*ip_proto != IPPROTO_GRE) { - flow->src = load_word(skb, nh_off + offsetof(struct 
iphdr, saddr)); - flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr)); - } - - ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */); - if (likely(ip_ver_len == 0x45)) - nh_off += 20; - else - nh_off += (ip_ver_len & 0xF) << 2; - - return nh_off; -} - -static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off) -{ - __u32 w0 = load_word(skb, off); - __u32 w1 = load_word(skb, off + sizeof(w0)); - __u32 w2 = load_word(skb, off + sizeof(w0) * 2); - __u32 w3 = load_word(skb, off + sizeof(w0) * 3); - - return w0 ^ w1 ^ w2 ^ w3; -} - -static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off, - __u8 *ip_proto, struct flow_keys *flow) -{ - *ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr)); - - flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr)); - flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr)); - - return nh_off + sizeof(struct ipv6hdr); -} - -static inline bool flow_dissector(struct __sk_buff *skb, - struct flow_keys *flow) -{ - int poff, nh_off = BPF_LL_OFF + ETH_HLEN; - __be16 proto = skb->protocol; - __u8 ip_proto; - - /* TODO: check for skb->vlan_tci, skb->vlan_proto first */ - if (proto == htons(ETH_P_8021AD)) { - proto = load_half(skb, nh_off + - offsetof(struct vlan_hdr, h_vlan_encapsulated_proto)); - nh_off += sizeof(struct vlan_hdr); - } - if (proto == htons(ETH_P_8021Q)) { - proto = load_half(skb, nh_off + - offsetof(struct vlan_hdr, h_vlan_encapsulated_proto)); - nh_off += sizeof(struct vlan_hdr); - } - - if (likely(proto == htons(ETH_P_IP))) - nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); - else if (proto == htons(ETH_P_IPV6)) - nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); - else - return false; - - switch (ip_proto) { - case IPPROTO_GRE: { - struct gre_hdr { - __be16 flags; - __be16 proto; - }; - - __u16 gre_flags = load_half(skb, nh_off + - offsetof(struct gre_hdr, flags)); - __u16 gre_proto = 
load_half(skb, nh_off + - offsetof(struct gre_hdr, proto)); - - if (gre_flags & (GRE_VERSION | GRE_ROUTING)) - break; - - nh_off += 4; - if (gre_flags & GRE_CSUM) - nh_off += 4; - if (gre_flags & GRE_KEY) - nh_off += 4; - if (gre_flags & GRE_SEQ) - nh_off += 4; - - if (gre_proto == ETH_P_8021Q) { - gre_proto = load_half(skb, nh_off + - offsetof(struct vlan_hdr, - h_vlan_encapsulated_proto)); - nh_off += sizeof(struct vlan_hdr); - } - if (gre_proto == ETH_P_IP) - nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); - else if (gre_proto == ETH_P_IPV6) - nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); - else - return false; - break; - } - case IPPROTO_IPIP: - nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); - break; - case IPPROTO_IPV6: - nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); - default: - break; - } - - nh_off += flow_ports_offset(ip_proto); - - flow->ports = load_word(skb, nh_off); - flow->th_off = nh_off; - flow->ip_proto = ip_proto; - - return true; -} - -static inline void cls_update_proto_map(const struct __sk_buff *skb, - const struct flow_keys *flow) -{ - uint8_t proto = flow->ip_proto; - struct count_tuple *ct, _ct; - - ct = map_lookup_elem(&map_proto, &proto); - if (likely(ct)) { - lock_xadd(&ct->packets, 1); - lock_xadd(&ct->bytes, skb->len); - return; - } - - /* No hit yet, we need to create a new entry. */ - _ct.packets = 1; - _ct.bytes = skb->len; - - map_update_elem(&map_proto, &proto, &_ct, BPF_ANY); -} - -static inline void cls_update_queue_map(const struct __sk_buff *skb) -{ - uint32_t queue = skb->queue_mapping; - struct count_queue *cq, _cq; - bool mismatch; - - mismatch = skb->queue_mapping != get_smp_processor_id(); - - cq = map_lookup_elem(&map_queue, &queue); - if (likely(cq)) { - lock_xadd(&cq->total, 1); - if (mismatch) - lock_xadd(&cq->mismatch, 1); - return; - } - - /* No hit yet, we need to create a new entry. */ - _cq.total = 1; - _cq.mismatch = mismatch ? 
1 : 0; - - map_update_elem(&map_queue, &queue, &_cq, BPF_ANY); -} - -/* eBPF program definitions, placed in various sections, which can - * have custom section names. If custom names are in use, it's - * required to point tc to the correct section, e.g. - * - * tc filter add [...] bpf obj cls.o sec cls-tos [...] - * - * in case the program resides in __section("cls-tos"). - * - * Default section for cls_bpf is: "classifier", for act_bpf is: - * "action". Naturally, if for example multiple actions are present - * in the same file, they need to have distinct section names. - * - * It is however not required to have multiple programs sharing - * a file. - */ -__section("classifier") -int cls_main(struct __sk_buff *skb) -{ - struct flow_keys flow; - - if (!flow_dissector(skb, &flow)) - return 0; /* No match in cls_bpf. */ - - cls_update_proto_map(skb, &flow); - cls_update_queue_map(skb); - - return flow.ip_proto; -} - -static inline void act_update_drop_map(void) -{ - uint32_t *count, cpu = get_smp_processor_id(); - - count = map_lookup_elem(&map_drops, &cpu); - if (count) - /* Only this cpu is accessing this element. */ - (*count)++; -} - -__section("action-mark") -int act_mark_main(struct __sk_buff *skb) -{ - /* You could also mangle skb data here with the helper function - * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could - * do that already in the classifier itself as a merged combination - * of classifier'n'action model. - */ - - if (skb->mark == 0xcafe) { - act_update_drop_map(); - return TC_ACT_SHOT; - } - - /* Default configured tc opcode. */ - return TC_ACT_UNSPEC; -} - -__section("action-rand") -int act_rand_main(struct __sk_buff *skb) -{ - /* Sorry, we're near event horizon ... */ - if ((get_prandom_u32() & 3) == 0) { - act_update_drop_map(); - return TC_ACT_SHOT; - } - - return TC_ACT_UNSPEC; -} - -/* Last but not least, the file contains a license. Some future helper - * functions may only be available with a GPL license. 
- */ -BPF_LICENSE("GPL"); |