Classic BPF (the classic Berkeley Packet Filter) is a technique that can significantly improve performance in certain low-level network programming scenarios in Go.

Background

I previously developed a Go application in which the client and the server exchange UDP packets over raw sockets. The purpose of that program is rather specific, so I’ll present a simplified program as an example here.

Actually, I’m not being strict when I say I’m using the rawsocket approach.

I did not use the following way to implement sockets and communicate (link layer way).

1
2
3
4
5
6
7
fd, err:= syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW,syscall.ETH_P_ALL)
if (err != nil) {
    fmt.Println("Error: " + err.Error())
    return;
}
fmt.Println("Obtained fd ", fd)
defer syscall.Close(fd)

The rawsocket approach (IP layer approach) below is also not used.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
// main opens an IP-layer raw socket for UDP and sends one hand-built packet
// (produced by pkt) to 127.0.0.1.
func main() {
    fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_RAW, syscall.IPPROTO_UDP)
    if err != nil {
        // Without a valid descriptor there is nothing to send on, so stop
        // here instead of calling Sendto with an invalid fd.
        fmt.Println("Problem @ location 1", err)
        return
    }
    addr := syscall.SockaddrInet4{
        Port: 27289,
        Addr: [4]byte{127, 0, 0, 1},
    }
    err = syscall.Sendto(fd, pkt(), 0, &addr)
    // Close before checking the result: log.Fatal exits the process without
    // running deferred calls.
    syscall.Close(fd)
    if err != nil {
        log.Fatal("Sendto:", err)
    }
}

Instead, it directly uses the method net.ListenPacket("ip4:udp", addr) encapsulated in the Go standard library to send and receive packets at the IP layer.

I implement custom packet sending and receiving by wrapping custom UDP layer data structure for network monitoring.

Some people may ask: why not just use the standard library’s UDPConn? For an ordinary UDP program that is indeed fine. But for some special needs — for example, listening on 1,000 UDP ports while tens of thousands of nodes regularly send monitoring data — we are unlikely to create 1,000 × 10,000 UDPConn instances, so here I use the rawsocket communication method instead.

RawSocket is part of the standard Berkeley socket API. When we use Go’s standard library to develop network programs, most scenarios use the encapsulated datagram type (UDPConn) or stream type (TCPConn). But if you want to do lower-level network programming, for example working directly with underlying protocols such as TCP, UDP, ICMP or ARP, you need to use RawSocket. Different operating systems may implement RawSocket differently; here we take the Linux environment as an example.

The Linux man manual gives a detailed introduction to RawSocket related knowledge: socket(2), packet(7), raw(7), which will not be reproduced in this article, and is not the focus of this article.

According to the Linux documentation, packets received in the Linux server are passed to both the kernel network module and RawSocket, so you sometimes need to be careful when using RawSocket, for example, if you are processing a TCP packet, the Linux kernel network program may have already processed the packet.

Raw sockets may tap all IP protocols in Linux, even protocols like ICMP or TCP which have a protocol module in the kernel. In this case, the packets are passed to both the kernel module and the raw socket(s). This should not be relied upon in portable programs, many other BSD socket implementation have limitations here.

If there are no special requirements, we will just use net.ListenPacket to implement a RawSocket program. The signature of this method is as follows:

1
func ListenPacket(network, address string) (PacketConn, error)

The first parameter network can be udp, udp4, udp6, unixgram, or ip, ip4, ip6 with a colon and a protocol number or protocol name, such as ip:1, ip:icmp.

Demo program

Server-side program

The server-side program calls conn, err := net.ListenPacket("ip4:udp", *addr) to receive every UDP packet addressed to the local address and processes the packets in a read loop. The handler should also check whether the UDP destination port is the one we are serving, because net.ListenPacket here listens to all local UDP traffic, so many irrelevant UDP packets may be delivered to the user-space program.

Here we use gopacket’s definition of packets for various protocol layers to facilitate the parsing (or creation) of network protocols for each TCP/IP layer.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package main
import (
    "flag"
    "log"
    "net"
    "github.com/google/gopacket"
    "github.com/google/gopacket/layers"
    "github.com/smallnest/go-network-programming/codec"
    "golang.org/x/net/bpf"
    "golang.org/x/net/ipv4"
)
// Command-line flags of the server.
var (
    addr = flag.String("s", "localhost", "server address")
    port = flag.Int("p", 8972, "port")
)

// Statistics state. It is not referenced in this listing; presumably it is
// used by the measurement code omitted from the article.
var (
    stat         = make(map[string]int)
    lastStatTime = int64(0)
)

// main opens an IP-layer raw socket that receives every UDP packet addressed
// to *addr, enlarges its kernel buffers and serves it with handleConn.
func main() {
    flag.Parse()
    conn, err := net.ListenPacket("ip4:udp", *addr)
    if err != nil {
        panic(err)
    }
    cc := conn.(*net.IPConn)
    // Bigger socket buffers are only an optimisation, so a failure is logged
    // and the server keeps running with the defaults.
    if err := cc.SetReadBuffer(20 * 1024 * 1024); err != nil {
        log.Printf("failed to set read buffer: %v", err)
    }
    if err := cc.SetWriteBuffer(20 * 1024 * 1024); err != nil {
        log.Printf("failed to set write buffer: %v", err)
    }
    handleConn(conn)
}

// handleConn echoes the payload of every UDP datagram whose destination port
// is *port back to its sender, with the source and destination ports swapped.
//
// The raw socket delivers all UDP traffic for the address, including the
// replies this function itself sends over loopback. Datagrams for any other
// port are therefore skipped; echoing them would make the server answer its
// own replies forever.
//
// It returns when a reply cannot be encoded or written, and terminates the
// process if reading from conn fails.
func handleConn(conn net.PacketConn) {
    // One buffer is reused for every read: the decoded packet (NoCopy) is not
    // kept beyond the iteration that produced it.
    buf := make([]byte, 1024)
    for {
        n, remoteaddr, err := conn.ReadFrom(buf)
        if err != nil {
            log.Fatal(err)
        }
        packet := gopacket.NewPacket(buf[:n], layers.LayerTypeUDP, gopacket.NoCopy)
        udp, ok := packet.Layer(layers.LayerTypeUDP).(*layers.UDP)
        if !ok || int(udp.DstPort) != *port {
            continue // not a UDP datagram for the port we serve
        }
        app := packet.ApplicationLayer()
        if app == nil {
            continue // no payload to echo
        }
        data, err := codec.EncodeUDPPacket(net.ParseIP("127.0.0.1"), net.ParseIP("127.0.0.1"), uint16(udp.DstPort), uint16(udp.SrcPort), app.Payload())
        if err != nil {
            log.Printf("failed to EncodePacket: %v", err)
            return
        }
        if _, err := conn.WriteTo(data, remoteaddr); err != nil {
            log.Printf("failed to write packet: %v", err)
            conn.Close()
            return
        }
    }
}

Client program

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
package main
import (
    "fmt"
    "log"
    "net"
    "time"

    "github.com/google/gopacket"
    "github.com/google/gopacket/layers"
    "github.com/smallnest/go-network-programming/codec"
)
// main sends one "hello" datagram to the server over an IP-layer raw socket
// and prints the payload of the server's reply.
func main() {
    const (
        serverPort = 8972 // UDP port the demo server handles by default
        clientPort = 9972 // source port placed in our own datagrams
    )

    conn, err := net.ListenPacket("ip4:udp", "127.0.0.1")
    if err != nil {
        panic(err)
    }
    defer conn.Close()

    // EncodeUDPPacket takes the local (source) port before the remote
    // (destination) port.
    data, err := codec.EncodeUDPPacket(net.ParseIP("127.0.0.1"), net.ParseIP("127.0.0.1"), clientPort, serverPort, []byte("hello"))
    if err != nil {
        log.Printf("failed to EncodePacket: %v", err)
        return
    }
    remoteAddr := &net.IPAddr{IP: net.ParseIP("127.0.0.1")}
    if _, err := conn.WriteTo(data, remoteAddr); err != nil {
        log.Printf("failed to write packet: %v", err)
        return
    }

    // Do not wait forever if the server is not running.
    if err := conn.SetReadDeadline(time.Now().Add(5 * time.Second)); err != nil {
        log.Printf("failed to set read deadline: %v", err)
        return
    }
    buffer := make([]byte, 1024)
    for {
        n, _, err := conn.ReadFrom(buffer)
        if err != nil {
            log.Printf("failed to read reply: %v", err)
            return
        }
        packet := gopacket.NewPacket(buffer[:n], layers.LayerTypeUDP, gopacket.NoCopy)
        udp, ok := packet.Layer(layers.LayerTypeUDP).(*layers.UDP)
        // The raw socket also sees our own outgoing datagram and unrelated
        // UDP traffic on loopback; only a datagram from the server port to
        // our port is the reply.
        if !ok || udp.SrcPort != serverPort || udp.DstPort != clientPort {
            continue
        }
        if app := packet.ApplicationLayer(); app != nil {
            fmt.Printf("reply: %s\n", app.Payload())
        }
        return
    }
}

The client program is simplified here by writing a hello and reading the return from the server. When we do performance testing, we use a loop to continuously write a seq number and check whether the server returns this seq in order to calculate the packet loss performance. And also use a flow limiter to limit the flow and test the packet loss rate at a certain RPS.

Auxiliary methods

The following is the EncodeUDPPacket method, which is used to generate a UDP packet data.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package codec
import (
    "net"
    "github.com/google/gopacket"
    "github.com/google/gopacket/layers"
)
// EncodeUDPPacket serializes a UDP datagram (UDP header followed by payload)
// with source port localPort and destination port remotePort.
//
// localIP and remoteIP are used only to compute the UDP checksum over the
// IPv4 pseudo-header; no IP header is written to the result.
//
// It returns the encoded bytes, or nil and the serialization error.
func EncodeUDPPacket(localIP, remoteIP net.IP, localPort, remotePort uint16, payload []byte) ([]byte, error) {
    ip := &layers.IPv4{
        Version:  4,
        TTL:      128,
        SrcIP:    localIP,
        DstIP:    remoteIP,
        Protocol: layers.IPProtocolUDP,
    }
    udp := &layers.UDP{
        SrcPort: layers.UDPPort(localPort),
        DstPort: layers.UDPPort(remotePort),
    }
    if err := udp.SetNetworkLayerForChecksum(ip); err != nil {
        return nil, err
    }
    buf := gopacket.NewSerializeBuffer()
    opts := gopacket.SerializeOptions{
        ComputeChecksums: true,
        FixLengths:       true,
    }
    if err := gopacket.SerializeLayers(buf, opts, udp, gopacket.Payload(payload)); err != nil {
        // Do not hand back a partially written buffer together with an error.
        return nil, err
    }
    return buf.Bytes(), nil
}

Performance issues

Although the above program runs fine, there are some problems in case of higher concurrency.

The program above reads packets from a single goroutine. This is a performance bottleneck: the server ends up using only one core to handle the RawSocket packets.

Even if you create multiple goroutines to read this PacketConn, it is useless because this PacketConn is unique and it is a bottleneck. Sometimes it is better to use only one goroutine to read it than multiple goroutines.

So can we call net.ListenPacket("ip4:udp", *addr) multiple times to generate multiple RawSockets to process concurrently?

It seems to work, but in reality, the multiple RawSockets all read the same UDPPacket instead of being load balanced and spread out over multiple Sockets. So multiple RawSockets are not only useless, but also consume more resources of the server.

The actual test can only reach 20-30 thousand throughput, and the higher the concurrency, the higher the packet loss.

But is there no way out?

Not really. Here we can see that the main performance bottleneck is that our program above has no way to do load balancing, using the power of multiple cores to concurrently process. The second performance bottleneck is that the program listens to all the UDP packets on the local machine and hands them over to the user state program to filter and process, which has a lot of packets we don’t need.

Both of these performance problems can be handled by BPF.

BPF for Packet Filtering

The classic BPF has been around since 1994, and although everyone is now talking about the extended BPF (eBPF), the classic BPF still has the power to make it work.

You may not have applied BPF in programming, but I believe you must have worked with it in practice.

For example, when you use tcpdump to listen to the transmission of the network, you often add filters, such as the following command to listen only to port 8080 of the tcp protocol:

1
tcpdump -nn -vvvv -i any "tcp port 8080"

tcpdump actually compiles the expression tcp port 8080 into a BPF filter that runs in the kernel, so only the packets that match the filter are delivered to tcpdump.

You can actually view the compiled filtering code with the following command.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
[root@lab ~]# tcpdump -d "tcp port 8080"
(000) ldh      [12]
(001) jeq      #0x86dd          jt 2    jf 8
(002) ldb      [20]
(003) jeq      #0x6             jt 4    jf 19
(004) ldh      [54]
(005) jeq      #0x1f90          jt 18   jf 6
(006) ldh      [56]
(007) jeq      #0x1f90          jt 18   jf 19
(008) jeq      #0x800           jt 9    jf 19
(009) ldb      [23]
(010) jeq      #0x6             jt 11   jf 19
(011) ldh      [20]
(012) jset     #0x1fff          jt 19   jf 13
(013) ldxb     4*([14]&0xf)
(014) ldh      [x + 14]
(015) jeq      #0x1f90          jt 18   jf 16
(016) ldh      [x + 16]
(017) jeq      #0x1f90          jt 18   jf 19
(018) ret      #262144
(019) ret      #0

What does this mean? BPF defines a limited number of instructions to filter the packets in the VM.

Line 000 loads the half-word at offset 12 of the packet (the EtherType field of the Ethernet header). Line 001 checks whether it is IPv6 (0x86dd): if so, execution continues at 002, otherwise it jumps to 008. Let’s focus on IPv4.

The 008 line is to determine if it is IPv4, if it is jump to 009. The 009 line loads a byte at offset 23, which is the ip proto, and the 010 line determines if the ip proto is TCP, if so, jump to 011.

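Next, lines 011 and 012 load the flags/fragment-offset field and reject the packet if the fragment offset is non-zero (only the first fragment carries the TCP header), and line 013 loads the IP header length into the index register X so that the start of the TCP header can be located.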

Lines 014 and 016 read the source and destination ports of the TCP header. If either of them equals 8080 (0x1f90), the filter returns 262144, meaning that up to 262144 bytes of the packet are accepted; otherwise it returns 0 and the packet is discarded.

Of course, the code generated by tcpdump is quite rigorous. When we write a filter by hand, and we are sure that the packets are IPv4 packets without extensions such as IP options, the code can be simpler than this. Still, when we actually apply BPF, we might as well use the code generated by tcpdump, which is free of errors.

Use -dd to display it as a c code fragment, and -ddd to display it as a decimal number. Let’s look at the effect of -dd, as this result we can use to convert to Go code.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
[root@lab ~]# tcpdump -dd "tcp port 8080"
{ 0x28, 0, 0, 0x0000000c },
{ 0x15, 0, 6, 0x000086dd },
{ 0x30, 0, 0, 0x00000014 },
{ 0x15, 0, 15, 0x00000006 },
{ 0x28, 0, 0, 0x00000036 },
{ 0x15, 12, 0, 0x00001f90 },
{ 0x28, 0, 0, 0x00000038 },
{ 0x15, 10, 11, 0x00001f90 },
{ 0x15, 0, 10, 0x00000800 },
{ 0x30, 0, 0, 0x00000017 },
{ 0x15, 0, 8, 0x00000006 },
{ 0x28, 0, 0, 0x00000014 },
{ 0x45, 6, 0, 0x00001fff },
{ 0xb1, 0, 0, 0x0000000e },
{ 0x48, 0, 0, 0x0000000e },
{ 0x15, 2, 0, 0x00001f90 },
{ 0x48, 0, 0, 0x00000010 },
{ 0x15, 0, 1, 0x00001f90 },
{ 0x6, 0, 0, 0x00040000 },
{ 0x6, 0, 0, 0x00000000 },

In fact, x/net/bpf provides the corresponding methods to write BPF programs, serializing and deserializing more easily. For example, to write a program that filters out all packets with destination port equal to 8972, we can simply write it in the following format (considering the simple form, we only considered the form of IPV4 and normal IP packets):

1
2
3
4
5
6
7
// Filter is a classic BPF program expressed as a list of x/net/bpf instructions.
type Filter []bpf.Instruction
// filter accepts packets whose 16-bit value at absolute offset 22 equals 8972
// and drops everything else. Offset 22 is the UDP destination port only when
// the data starts with a 20-byte IPv4 header, i.e. one without IP options.
var filter = Filter{
    bpf.LoadAbsolute{Off: 22, Size: 2},                                // Load the 2-byte destination port into the accumulator
    bpf.JumpIf{Cond: bpf.JumpEqual, Val: 8972, SkipFalse: 1}, // If it equals 8972 fall through to the next instruction, otherwise skip one instruction
    bpf.RetConstant{Val: 0xffff},   // Accept: return up to 0xffff bytes of this packet
    bpf.RetConstant{Val: 0x0}, // Reject: return zero bytes, i.e. ignore the packet
}

We can write a program that converts the code generated by tcpdump into bpf’s RawInstruction instruction:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// parse converts the output of `tcpdump -dd` into raw BPF instructions.
//
// Every instruction line has the form `{ op, jt, jf, k },`. The trailing comma
// is optional and surrounding whitespace is ignored. Lines that do not consist
// of exactly four comma-separated fields (blank lines, a pasted shell prompt,
// ...) are skipped rather than causing an index-out-of-range panic.
func parse(data string) (raws []bpf.RawInstruction) {
    for _, line := range strings.Split(data, "\n") {
        line = strings.TrimSpace(line)
        line = strings.TrimPrefix(line, "{")
        line = strings.TrimSuffix(line, ",")
        line = strings.TrimSuffix(strings.TrimSpace(line), "}")
        items := strings.Split(line, ",")
        if len(items) != 4 {
            continue
        }
        raws = append(raws, bpf.RawInstruction{
            Op: uint16(numToInteger(items[0])),
            Jt: uint8(numToInteger(items[1])),
            Jf: uint8(numToInteger(items[2])),
            K:  uint32(numToInteger(items[3])),
        })
    }
    return raws
}
// numToInteger parses s as an integer after trimming surrounding whitespace.
// A leading "0x" or "0X" selects hexadecimal; anything else is read as
// decimal. It returns 0 when s is not a valid number.
func numToInteger(s string) int {
    s = strings.TrimSpace(s)
    base := 10
    // Strip only the prefix: removing every "0x" occurrence would turn
    // malformed input such as "0x10x2" into a valid-looking number.
    if len(s) > 1 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X') {
        s, base = s[2:], 16
    }
    result, err := strconv.ParseInt(s, base, 64)
    if err != nil {
        return 0
    }
    return int(result)
}

Now everything is ready: we have covered the background knowledge and identified the performance bottlenecks of the current RawSocket program. So how do we solve them?

For the first bottleneck, we can start multiple goroutines, each with its own socket and BPF filter, so that each goroutine is responsible for only a part of the packets and load balancing is achieved. For example, we can filter by the client’s IP address; or, if the server listens on 1,000 ports, each goroutine can be responsible for only a portion of the ports; filtering can also be based on the client’s source port, and so on. In short, with BPF filtering each goroutine handles only a part of the packets, which enables multi-core processing.

The second bottleneck is solved with the first problem. Because BPF only filters our specific ports, UDP packets from other ports are not copied from the kernel state to the user state, reducing the processing of useless packets.

To set BPF filtering for PacketConn of standard library, there are also various ways to do it, such as calling syscall.SetsockoptInt to set it. But golang.org/x/net/ipv4 provides the SetBPF method, so we can directly convert the PacketConn of the standard library to ipv4.PacketConn, and then set it.

For example, in the server program above, we can modify it to use BPF filtering in the kernel state:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package main
import (
    "flag"
    "log"
    "net"
    "github.com/google/gopacket"
    "github.com/google/gopacket/layers"
    "github.com/smallnest/go-network-programming/codec"
    "golang.org/x/net/bpf"
    "golang.org/x/net/ipv4"
)
var (
    addr = flag.String("s", "localhost", "server address")
    port = flag.Int("p", 8972, "port")
)
var (
    stat         = make(map[string]int)
    lastStatTime = int64(0)
)
func main() {
    flag.Parse()
    conn, err := net.ListenPacket("ip4:udp", *addr)
    if err != nil {
        panic(err)
    }
    cc := conn.(*net.IPConn)
    cc.SetReadBuffer(20 * 1024 * 1024)
    cc.SetWriteBuffer(20 * 1024 * 1024)
    pconn := ipv4.NewPacketConn(conn)
    var assembled []bpf.RawInstruction
    if assembled, err = bpf.Assemble(filter); err != nil {
        log.Print(err)
        return
    }
    pconn.SetBPF(assembled)
    handleConn(conn)
}
// handleConn echoes the payload of every UDP datagram read from conn back to
// its sender, with the source and destination ports swapped.
//
// It returns when a reply cannot be encoded or written, and terminates the
// process if reading from conn fails.
func handleConn(conn net.PacketConn) {
    // One buffer is reused for every read: the decoded packet (NoCopy) is not
    // kept beyond the iteration that produced it, so a fresh allocation per
    // packet is unnecessary.
    buf := make([]byte, 1024)
    for {
        n, remoteaddr, err := conn.ReadFrom(buf)
        if err != nil {
            log.Fatal(err)
        }
        packet := gopacket.NewPacket(buf[:n], layers.LayerTypeUDP, gopacket.NoCopy)
        udp, ok := packet.Layer(layers.LayerTypeUDP).(*layers.UDP)
        if !ok {
            continue // could not be decoded as UDP
        }
        app := packet.ApplicationLayer()
        if app == nil {
            continue // no payload to echo
        }
        data, err := codec.EncodeUDPPacket(net.ParseIP("127.0.0.1"), net.ParseIP("127.0.0.1"), uint16(udp.DstPort), uint16(udp.SrcPort), app.Payload())
        if err != nil {
            log.Printf("failed to EncodePacket: %v", err)
            return
        }
        if _, err := conn.WriteTo(data, remoteaddr); err != nil {
            log.Printf("failed to write packet: %v", err)
            conn.Close()
            return
        }
    }
}
// Filter is a classic BPF program expressed as a list of x/net/bpf instructions.
type Filter []bpf.Instruction
// filter accepts packets whose 16-bit value at absolute offset 22 equals the
// configured port and drops everything else. Offset 22 is the UDP destination
// port only when the data starts with a 20-byte IPv4 header (no IP options).
//
// NOTE(review): this is a package-level initializer, so *port is read during
// package initialization, before main calls flag.Parse. Val is therefore
// always the flag's default (8972) and a -p override is not reflected here.
var filter = Filter{
    bpf.LoadAbsolute{Off: 22, Size: 2},                                // load the 2-byte destination port
    bpf.JumpIf{Cond: bpf.JumpEqual, Val: uint32(*port), SkipFalse: 1}, // if it differs from Val, skip the next instruction
    bpf.RetConstant{Val: 0xffff},                                      // accept: return 0xffff bytes (or less) from packet
    bpf.RetConstant{Val: 0x0}, // reject: return 0 bytes, effectively ignore this packet
}