Skip to content

Commit cdac47d

Browse files
jwhited and raggi committed
conn, device, tun: implement vectorized I/O on Linux
This commit changes the tun.Device and conn.Bind interfaces to accept packet vectors for reading and writing. Internal plumbing between these interfaces now passes a vector of packets. Vectors move untouched between these interfaces, i.e. if 128 packets are received from conn.Bind.Read(), 128 packets are passed to tun.Device.Write(). There is no internal buffering. Platform-specific implementations of tun.Device have been updated to the new tun.Device interface, but only Linux supports passing more than one packet for now. The Linux tun.Device implementation accomplishes this via TSO and GRO, which is made possible by virtio extensions in the TUN driver. conn.LinuxSocketEndpoint has been deleted in favor of a collapsed conn.StdNetBind. conn.StdNetBind makes use of recvmmsg() and sendmmsg() on Linux. All platforms fall under conn.StdNetBind, except for Windows, which remains in conn.WinRingBind. Sticky sockets support has been refactored as part of this work to eventually be applicable on platforms other than just Linux, however Linux remains the sole platform that fully implements it. The conn.Bind UDP socket buffer is now being sized to 7MB, whereas it was previously inheriting the system default. Signed-off-by: Jordan Whited <[email protected]> Signed-off-by: James Tucker <[email protected]> Co-authored-by: James Tucker <[email protected]>
1 parent 2163620 commit cdac47d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+2864
-1197
lines changed

conn/bind_linux.go

Lines changed: 0 additions & 562 deletions
This file was deleted.

conn/bind_std.go

Lines changed: 241 additions & 83 deletions
Large diffs are not rendered by default.

conn/bind_windows.go

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,11 @@ func (bind *WinRingBind) Close() error {
321321
return nil
322322
}
323323

324+
func (bind *WinRingBind) BatchSize() int {
325+
// TODO: implement batching in and out of the ring
326+
return 1
327+
}
328+
324329
func (bind *WinRingBind) SetMark(mark uint32) error {
325330
return nil
326331
}
@@ -409,16 +414,22 @@ retry:
409414
return n, &ep, nil
410415
}
411416

412-
func (bind *WinRingBind) receiveIPv4(buf []byte) (int, Endpoint, error) {
417+
func (bind *WinRingBind) receiveIPv4(buffs [][]byte, sizes []int, eps []Endpoint) (int, error) {
413418
bind.mu.RLock()
414419
defer bind.mu.RUnlock()
415-
return bind.v4.Receive(buf, &bind.isOpen)
420+
n, ep, err := bind.v4.Receive(buffs[0], &bind.isOpen)
421+
sizes[0] = n
422+
eps[0] = ep
423+
return 1, err
416424
}
417425

418-
func (bind *WinRingBind) receiveIPv6(buf []byte) (int, Endpoint, error) {
426+
func (bind *WinRingBind) receiveIPv6(buffs [][]byte, sizes []int, eps []Endpoint) (int, error) {
419427
bind.mu.RLock()
420428
defer bind.mu.RUnlock()
421-
return bind.v6.Receive(buf, &bind.isOpen)
429+
n, ep, err := bind.v6.Receive(buffs[0], &bind.isOpen)
430+
sizes[0] = n
431+
eps[0] = ep
432+
return 1, err
422433
}
423434

424435
func (bind *afWinRingBind) Send(buf []byte, nend *WinRingEndpoint, isOpen *atomic.Uint32) error {
@@ -473,32 +484,38 @@ func (bind *afWinRingBind) Send(buf []byte, nend *WinRingEndpoint, isOpen *atomi
473484
return winrio.SendEx(bind.rq, dataBuffer, 1, nil, addressBuffer, nil, nil, 0, 0)
474485
}
475486

476-
func (bind *WinRingBind) Send(buf []byte, endpoint Endpoint) error {
487+
func (bind *WinRingBind) Send(buffs [][]byte, endpoint Endpoint) error {
477488
nend, ok := endpoint.(*WinRingEndpoint)
478489
if !ok {
479490
return ErrWrongEndpointType
480491
}
481492
bind.mu.RLock()
482493
defer bind.mu.RUnlock()
483-
switch nend.family {
484-
case windows.AF_INET:
485-
if bind.v4.blackhole {
486-
return nil
487-
}
488-
return bind.v4.Send(buf, nend, &bind.isOpen)
489-
case windows.AF_INET6:
490-
if bind.v6.blackhole {
491-
return nil
494+
for _, buf := range buffs {
495+
switch nend.family {
496+
case windows.AF_INET:
497+
if bind.v4.blackhole {
498+
continue
499+
}
500+
if err := bind.v4.Send(buf, nend, &bind.isOpen); err != nil {
501+
return err
502+
}
503+
case windows.AF_INET6:
504+
if bind.v6.blackhole {
505+
continue
506+
}
507+
if err := bind.v6.Send(buf, nend, &bind.isOpen); err != nil {
508+
return err
509+
}
492510
}
493-
return bind.v6.Send(buf, nend, &bind.isOpen)
494511
}
495512
return nil
496513
}
497514

498-
func (bind *StdNetBind) BindSocketToInterface4(interfaceIndex uint32, blackhole bool) error {
499-
bind.mu.Lock()
500-
defer bind.mu.Unlock()
501-
sysconn, err := bind.ipv4.SyscallConn()
515+
func (s *StdNetBind) BindSocketToInterface4(interfaceIndex uint32, blackhole bool) error {
516+
s.mu.Lock()
517+
defer s.mu.Unlock()
518+
sysconn, err := s.ipv4.SyscallConn()
502519
if err != nil {
503520
return err
504521
}
@@ -511,14 +528,14 @@ func (bind *StdNetBind) BindSocketToInterface4(interfaceIndex uint32, blackhole
511528
if err != nil {
512529
return err
513530
}
514-
bind.blackhole4 = blackhole
531+
s.blackhole4 = blackhole
515532
return nil
516533
}
517534

518-
func (bind *StdNetBind) BindSocketToInterface6(interfaceIndex uint32, blackhole bool) error {
519-
bind.mu.Lock()
520-
defer bind.mu.Unlock()
521-
sysconn, err := bind.ipv6.SyscallConn()
535+
func (s *StdNetBind) BindSocketToInterface6(interfaceIndex uint32, blackhole bool) error {
536+
s.mu.Lock()
537+
defer s.mu.Unlock()
538+
sysconn, err := s.ipv6.SyscallConn()
522539
if err != nil {
523540
return err
524541
}
@@ -531,7 +548,7 @@ func (bind *StdNetBind) BindSocketToInterface6(interfaceIndex uint32, blackhole
531548
if err != nil {
532549
return err
533550
}
534-
bind.blackhole6 = blackhole
551+
s.blackhole6 = blackhole
535552
return nil
536553
}
537554

conn/bindtest/bindtest.go

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -89,32 +89,39 @@ func (c *ChannelBind) Close() error {
8989
return nil
9090
}
9191

92+
func (c *ChannelBind) BatchSize() int { return 1 }
93+
9294
func (c *ChannelBind) SetMark(mark uint32) error { return nil }
9395

9496
func (c *ChannelBind) makeReceiveFunc(ch chan []byte) conn.ReceiveFunc {
95-
return func(b []byte) (n int, ep conn.Endpoint, err error) {
97+
return func(buffs [][]byte, sizes []int, eps []conn.Endpoint) (n int, err error) {
9698
select {
9799
case <-c.closeSignal:
98-
return 0, nil, net.ErrClosed
100+
return 0, net.ErrClosed
99101
case rx := <-ch:
100-
return copy(b, rx), c.target6, nil
102+
copied := copy(buffs[0], rx)
103+
sizes[0] = copied
104+
eps[0] = c.target6
105+
return 1, nil
101106
}
102107
}
103108
}
104109

105-
func (c *ChannelBind) Send(b []byte, ep conn.Endpoint) error {
106-
select {
107-
case <-c.closeSignal:
108-
return net.ErrClosed
109-
default:
110-
bc := make([]byte, len(b))
111-
copy(bc, b)
112-
if ep.(ChannelEndpoint) == c.target4 {
113-
*c.tx4 <- bc
114-
} else if ep.(ChannelEndpoint) == c.target6 {
115-
*c.tx6 <- bc
116-
} else {
117-
return os.ErrInvalid
110+
func (c *ChannelBind) Send(buffs [][]byte, ep conn.Endpoint) error {
111+
for _, b := range buffs {
112+
select {
113+
case <-c.closeSignal:
114+
return net.ErrClosed
115+
default:
116+
bc := make([]byte, len(b))
117+
copy(bc, b)
118+
if ep.(ChannelEndpoint) == c.target4 {
119+
*c.tx4 <- bc
120+
} else if ep.(ChannelEndpoint) == c.target6 {
121+
*c.tx6 <- bc
122+
} else {
123+
return os.ErrInvalid
124+
}
118125
}
119126
}
120127
return nil

conn/boundif_android.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55

66
package conn
77

8-
func (bind *StdNetBind) PeekLookAtSocketFd4() (fd int, err error) {
9-
sysconn, err := bind.ipv4.SyscallConn()
8+
func (s *StdNetBind) PeekLookAtSocketFd4() (fd int, err error) {
9+
sysconn, err := s.ipv4.SyscallConn()
1010
if err != nil {
1111
return -1, err
1212
}
@@ -19,8 +19,8 @@ func (bind *StdNetBind) PeekLookAtSocketFd4() (fd int, err error) {
1919
return
2020
}
2121

22-
func (bind *StdNetBind) PeekLookAtSocketFd6() (fd int, err error) {
23-
sysconn, err := bind.ipv6.SyscallConn()
22+
func (s *StdNetBind) PeekLookAtSocketFd6() (fd int, err error) {
23+
sysconn, err := s.ipv6.SyscallConn()
2424
if err != nil {
2525
return -1, err
2626
}

conn/conn.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,17 @@ import (
1515
"strings"
1616
)
1717

18-
// A ReceiveFunc receives a single inbound packet from the network.
19-
// It writes the data into b. n is the length of the packet.
20-
// ep is the remote endpoint.
21-
type ReceiveFunc func(b []byte) (n int, ep Endpoint, err error)
18+
const (
19+
DefaultBatchSize = 128 // maximum number of packets handled per read and write
20+
)
21+
22+
// A ReceiveFunc receives at least one packet from the network and writes them
23+
// into packets. On a successful read it returns the number of elements of
24+
// sizes, packets, and endpoints that should be evaluated. Some elements of
25+
// sizes may be zero, and callers should ignore them. Callers must pass a sizes
26+
// and eps slice with a length greater than or equal to the length of packets.
27+
// These lengths must not exceed the value returned by the associated Bind.BatchSize().
28+
type ReceiveFunc func(packets [][]byte, sizes []int, eps []Endpoint) (n int, err error)
2229

2330
// A Bind listens on a port for both IPv6 and IPv4 UDP traffic.
2431
//
@@ -38,11 +45,16 @@ type Bind interface {
3845
// This mark is passed to the kernel as the socket option SO_MARK.
3946
SetMark(mark uint32) error
4047

41-
// Send writes a packet b to address ep.
42-
Send(b []byte, ep Endpoint) error
48+
// Send writes one or more packets in buffs to address ep. The length of
49+
// buffs must not exceed BatchSize().
50+
Send(buffs [][]byte, ep Endpoint) error
4351

4452
// ParseEndpoint creates a new endpoint from a string.
4553
ParseEndpoint(s string) (Endpoint, error)
54+
55+
// BatchSize is the number of buffers expected to be passed to
56+
// the ReceiveFuncs, and the maximum expected to be passed to Send.
57+
BatchSize() int
4658
}
4759

4860
// BindSocketToInterface is implemented by Bind objects that support being

conn/conn_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/* SPDX-License-Identifier: MIT
2+
*
3+
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
4+
*/
5+
6+
package conn
7+
8+
import (
9+
"testing"
10+
)
11+
12+
func TestPrettyName(t *testing.T) {
13+
var (
14+
recvFunc ReceiveFunc = func(buffs [][]byte, sizes []int, eps []Endpoint) (n int, err error) { return }
15+
)
16+
17+
const want = "TestPrettyName"
18+
19+
t.Run("ReceiveFunc.PrettyName", func(t *testing.T) {
20+
if got := recvFunc.PrettyName(); got != want {
21+
t.Errorf("PrettyName() = %v, want %v", got, want)
22+
}
23+
})
24+
}

conn/controlfns.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/* SPDX-License-Identifier: MIT
2+
*
3+
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
4+
*/
5+
6+
package conn
7+
8+
import (
9+
"net"
10+
"syscall"
11+
)
12+
13+
// UDP socket read/write buffer size (7MB). The value of 7MB is chosen as it is
14+
// the max supported by a default configuration of macOS. Some platforms will
15+
// silently clamp the value to other maximums, such as Linux clamping to
16+
// net.core.{r,w}mem_max (see controlfns_linux.go for additional implementation that works
17+
// around this limitation)
18+
const socketBufferSize = 7 << 20
19+
20+
// controlFn is the callback function signature from net.ListenConfig.Control.
21+
// It is used to apply platform specific configuration to the socket prior to
22+
// bind.
23+
type controlFn func(network, address string, c syscall.RawConn) error
24+
25+
// controlFns is a list of functions that are called from the listen config
26+
// that can apply socket options.
27+
var controlFns = []controlFn{}
28+
29+
// listenConfig returns a net.ListenConfig that applies the controlFns to the
30+
// socket prior to bind. This is used to apply socket buffer sizing and packet
31+
// information OOB configuration for sticky sockets.
32+
func listenConfig() *net.ListenConfig {
33+
return &net.ListenConfig{
34+
Control: func(network, address string, c syscall.RawConn) error {
35+
for _, fn := range controlFns {
36+
if err := fn(network, address, c); err != nil {
37+
return err
38+
}
39+
}
40+
return nil
41+
},
42+
}
43+
}

conn/controlfns_linux.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/* SPDX-License-Identifier: MIT
2+
*
3+
* Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
4+
*/
5+
6+
package conn
7+
8+
import (
9+
"fmt"
10+
"syscall"
11+
12+
"golang.org/x/sys/unix"
13+
)
14+
15+
func init() {
16+
controlFns = append(controlFns,
17+
18+
// Attempt to set the socket buffer size beyond net.core.{r,w}mem_max by
19+
// using SO_*BUFFORCE. This requires CAP_NET_ADMIN, and is allowed here to
20+
// fail silently - the result of failure is lower performance on very fast
21+
// links or high latency links.
22+
func(network, address string, c syscall.RawConn) error {
23+
return c.Control(func(fd uintptr) {
24+
// Set up to *mem_max
25+
_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_RCVBUF, socketBufferSize)
26+
_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_SNDBUF, socketBufferSize)
27+
// Set beyond *mem_max if CAP_NET_ADMIN
28+
_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, socketBufferSize)
29+
_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, socketBufferSize)
30+
})
31+
},
32+
33+
// Enable receiving of the packet information (IP_PKTINFO for IPv4,
34+
// IPV6_RECVPKTINFO for IPv6) that is used to implement sticky socket support.
35+
func(network, address string, c syscall.RawConn) error {
36+
var err error
37+
switch network {
38+
case "udp4":
39+
c.Control(func(fd uintptr) {
40+
err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_PKTINFO, 1)
41+
})
42+
case "udp6":
43+
c.Control(func(fd uintptr) {
44+
err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVPKTINFO, 1)
45+
if err != nil {
46+
return
47+
}
48+
err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_V6ONLY, 1)
49+
})
50+
default:
51+
err = fmt.Errorf("unhandled network: %s: %w", network, unix.EINVAL)
52+
}
53+
return err
54+
},
55+
)
56+
}

0 commit comments

Comments
 (0)