mirror of
https://github.com/containers/podman.git
synced 2025-12-02 02:58:03 +08:00
This includes two new hidden commands: a 9p server, `podman machine server9p`, and a 9p client, `podman machine client9p` with `server9p` currently only configured to run on Windows and serve 9p via HyperV vsock, and `client9p` only configured to run on Linux. The server is run by `podman machine start` and has the same lifespan as gvproxy (waits for the gvproxy PID to die before shutting down). The client is run inside the VM, also by `podman machine start`, and mounts uses kernel 9p mount code to complete the mount. It's unfortunately not possible to use mount directly without the wrapper; we need to set up the vsock and pass it to mount as an FD. In theory this can be generalized so that the server can run anywhere and over almost any transport, but I haven't done this here as I don't think we have a usecase other than HyperV right now. [NO NEW TESTS NEEDED] This requires changes to Podman in the VM, so we need to wait until a build with this lands in FCOS to test. Signed-off-by: Matthew Heon <matthew.heon@pm.me>
881 lines
28 KiB
Go
881 lines
28 KiB
Go
package socket
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"io"
|
|
"os"
|
|
"sync"
|
|
"sync/atomic"
|
|
"syscall"
|
|
"time"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// Lock in an expected public interface for convenience.
|
|
var _ interface {
|
|
io.ReadWriteCloser
|
|
syscall.Conn
|
|
SetDeadline(t time.Time) error
|
|
SetReadDeadline(t time.Time) error
|
|
SetWriteDeadline(t time.Time) error
|
|
} = &Conn{}
|
|
|
|
// A Conn is a low-level network connection which integrates with Go's runtime
|
|
// network poller to provide asynchronous I/O and deadline support.
|
|
//
|
|
// Many of a Conn's blocking methods support net.Conn deadlines as well as
|
|
// cancelation via context. Note that passing a context with a deadline set will
|
|
// override any of the previous deadlines set by calls to the SetDeadline family
|
|
// of methods.
|
|
type Conn struct {
|
|
// Indicates whether or not Conn.Close has been called. Must be accessed
|
|
// atomically. Atomics definitions must come first in the Conn struct.
|
|
closed uint32
|
|
|
|
// A unique name for the Conn which is also associated with derived file
|
|
// descriptors such as those created by accept(2).
|
|
name string
|
|
|
|
// facts contains information we have determined about Conn to trigger
|
|
// alternate behavior in certain functions.
|
|
facts facts
|
|
|
|
// Provides access to the underlying file registered with the runtime
|
|
// network poller, and arbitrary raw I/O calls.
|
|
fd *os.File
|
|
rc syscall.RawConn
|
|
}
|
|
|
|
// facts contains facts about a Conn.
|
|
type facts struct {
|
|
// isStream reports whether this is a streaming descriptor, as opposed to a
|
|
// packet-based descriptor like a UDP socket.
|
|
isStream bool
|
|
|
|
// zeroReadIsEOF reports Whether a zero byte read indicates EOF. This is
|
|
// false for a message based socket connection.
|
|
zeroReadIsEOF bool
|
|
}
|
|
|
|
// A Config contains options for a Conn.
|
|
type Config struct {
|
|
// NetNS specifies the Linux network namespace the Conn will operate in.
|
|
// This option is unsupported on other operating systems.
|
|
//
|
|
// If set (non-zero), Conn will enter the specified network namespace and an
|
|
// error will occur in Socket if the operation fails.
|
|
//
|
|
// If not set (zero), a best-effort attempt will be made to enter the
|
|
// network namespace of the calling thread: this means that any changes made
|
|
// to the calling thread's network namespace will also be reflected in Conn.
|
|
// If this operation fails (due to lack of permissions or because network
|
|
// namespaces are disabled by kernel configuration), Socket will not return
|
|
// an error, and the Conn will operate in the default network namespace of
|
|
// the process. This enables non-privileged use of Conn in applications
|
|
// which do not require elevated privileges.
|
|
//
|
|
// Entering a network namespace is a privileged operation (root or
|
|
// CAP_SYS_ADMIN are required), and most applications should leave this set
|
|
// to 0.
|
|
NetNS int
|
|
}
|
|
|
|
// High-level methods which provide convenience over raw system calls.
|
|
|
|
// Close closes the underlying file descriptor for the Conn, which also causes
|
|
// all in-flight I/O operations to immediately unblock and return errors. Any
|
|
// subsequent uses of Conn will result in EBADF.
|
|
func (c *Conn) Close() error {
|
|
// The caller has expressed an intent to close the socket, so immediately
|
|
// increment s.closed to force further calls to result in EBADF before also
|
|
// closing the file descriptor to unblock any outstanding operations.
|
|
//
|
|
// Because other operations simply check for s.closed != 0, we will permit
|
|
// double Close, which would increment s.closed beyond 1.
|
|
if atomic.AddUint32(&c.closed, 1) != 1 {
|
|
// Multiple Close calls.
|
|
return nil
|
|
}
|
|
|
|
return os.NewSyscallError("close", c.fd.Close())
|
|
}
|
|
|
|
// CloseRead shuts down the reading side of the Conn. Most callers should just
|
|
// use Close.
|
|
func (c *Conn) CloseRead() error { return c.Shutdown(unix.SHUT_RD) }
|
|
|
|
// CloseWrite shuts down the writing side of the Conn. Most callers should just
|
|
// use Close.
|
|
func (c *Conn) CloseWrite() error { return c.Shutdown(unix.SHUT_WR) }
|
|
|
|
// Read reads directly from the underlying file descriptor.
|
|
func (c *Conn) Read(b []byte) (int, error) { return c.fd.Read(b) }
|
|
|
|
// ReadContext reads from the underlying file descriptor with added support for
|
|
// context cancelation.
|
|
func (c *Conn) ReadContext(ctx context.Context, b []byte) (int, error) {
|
|
if c.facts.isStream && len(b) > maxRW {
|
|
b = b[:maxRW]
|
|
}
|
|
|
|
n, err := readT(c, ctx, "read", func(fd int) (int, error) {
|
|
return unix.Read(fd, b)
|
|
})
|
|
if n == 0 && err == nil && c.facts.zeroReadIsEOF {
|
|
return 0, io.EOF
|
|
}
|
|
|
|
return n, os.NewSyscallError("read", err)
|
|
}
|
|
|
|
// Write writes directly to the underlying file descriptor.
|
|
func (c *Conn) Write(b []byte) (int, error) { return c.fd.Write(b) }
|
|
|
|
// WriteContext writes to the underlying file descriptor with added support for
|
|
// context cancelation.
|
|
func (c *Conn) WriteContext(ctx context.Context, b []byte) (int, error) {
|
|
var (
|
|
n, nn int
|
|
err error
|
|
)
|
|
|
|
doErr := c.write(ctx, "write", func(fd int) error {
|
|
max := len(b)
|
|
if c.facts.isStream && max-nn > maxRW {
|
|
max = nn + maxRW
|
|
}
|
|
|
|
n, err = unix.Write(fd, b[nn:max])
|
|
if n > 0 {
|
|
nn += n
|
|
}
|
|
if nn == len(b) {
|
|
return err
|
|
}
|
|
if n == 0 && err == nil {
|
|
err = io.ErrUnexpectedEOF
|
|
return nil
|
|
}
|
|
|
|
return err
|
|
})
|
|
if doErr != nil {
|
|
return 0, doErr
|
|
}
|
|
|
|
return nn, os.NewSyscallError("write", err)
|
|
}
|
|
|
|
// SetDeadline sets both the read and write deadlines associated with the Conn.
|
|
func (c *Conn) SetDeadline(t time.Time) error { return c.fd.SetDeadline(t) }
|
|
|
|
// SetReadDeadline sets the read deadline associated with the Conn.
|
|
func (c *Conn) SetReadDeadline(t time.Time) error { return c.fd.SetReadDeadline(t) }
|
|
|
|
// SetWriteDeadline sets the write deadline associated with the Conn.
|
|
func (c *Conn) SetWriteDeadline(t time.Time) error { return c.fd.SetWriteDeadline(t) }
|
|
|
|
// ReadBuffer gets the size of the operating system's receive buffer associated
|
|
// with the Conn.
|
|
func (c *Conn) ReadBuffer() (int, error) {
|
|
return c.GetsockoptInt(unix.SOL_SOCKET, unix.SO_RCVBUF)
|
|
}
|
|
|
|
// WriteBuffer gets the size of the operating system's transmit buffer
|
|
// associated with the Conn.
|
|
func (c *Conn) WriteBuffer() (int, error) {
|
|
return c.GetsockoptInt(unix.SOL_SOCKET, unix.SO_SNDBUF)
|
|
}
|
|
|
|
// SetReadBuffer sets the size of the operating system's receive buffer
|
|
// associated with the Conn.
|
|
//
|
|
// When called with elevated privileges on Linux, the SO_RCVBUFFORCE option will
|
|
// be used to override operating system limits. Otherwise SO_RCVBUF is used
|
|
// (which obeys operating system limits).
|
|
func (c *Conn) SetReadBuffer(bytes int) error { return c.setReadBuffer(bytes) }
|
|
|
|
// SetWriteBuffer sets the size of the operating system's transmit buffer
|
|
// associated with the Conn.
|
|
//
|
|
// When called with elevated privileges on Linux, the SO_SNDBUFFORCE option will
|
|
// be used to override operating system limits. Otherwise SO_SNDBUF is used
|
|
// (which obeys operating system limits).
|
|
func (c *Conn) SetWriteBuffer(bytes int) error { return c.setWriteBuffer(bytes) }
|
|
|
|
// SyscallConn returns a raw network connection. This implements the
|
|
// syscall.Conn interface.
|
|
//
|
|
// SyscallConn is intended for advanced use cases, such as getting and setting
|
|
// arbitrary socket options using the socket's file descriptor. If possible,
|
|
// those operations should be performed using methods on Conn instead.
|
|
//
|
|
// Once invoked, it is the caller's responsibility to ensure that operations
|
|
// performed using Conn and the syscall.RawConn do not conflict with each other.
|
|
func (c *Conn) SyscallConn() (syscall.RawConn, error) {
|
|
if atomic.LoadUint32(&c.closed) != 0 {
|
|
return nil, os.NewSyscallError("syscallconn", unix.EBADF)
|
|
}
|
|
|
|
// TODO(mdlayher): mutex or similar to enforce syscall.RawConn contract of
|
|
// FD remaining valid for duration of calls?
|
|
return c.rc, nil
|
|
}
|
|
|
|
// Socket wraps the socket(2) system call to produce a Conn. domain, typ, and
|
|
// proto are passed directly to socket(2), and name should be a unique name for
|
|
// the socket type such as "netlink" or "vsock".
|
|
//
|
|
// The cfg parameter specifies optional configuration for the Conn. If nil, no
|
|
// additional configuration will be applied.
|
|
//
|
|
// If the operating system supports SOCK_CLOEXEC and SOCK_NONBLOCK, they are
|
|
// automatically applied to typ to mirror the standard library's socket flag
|
|
// behaviors.
|
|
func Socket(domain, typ, proto int, name string, cfg *Config) (*Conn, error) {
|
|
if cfg == nil {
|
|
cfg = &Config{}
|
|
}
|
|
|
|
if cfg.NetNS == 0 {
|
|
// Non-Linux or no network namespace.
|
|
return socket(domain, typ, proto, name)
|
|
}
|
|
|
|
// Linux only: create Conn in the specified network namespace.
|
|
return withNetNS(cfg.NetNS, func() (*Conn, error) {
|
|
return socket(domain, typ, proto, name)
|
|
})
|
|
}
|
|
|
|
// socket is the internal, cross-platform entry point for socket(2).
|
|
func socket(domain, typ, proto int, name string) (*Conn, error) {
|
|
var (
|
|
fd int
|
|
err error
|
|
)
|
|
|
|
for {
|
|
fd, err = unix.Socket(domain, typ|socketFlags, proto)
|
|
switch {
|
|
case err == nil:
|
|
// Some OSes already set CLOEXEC with typ.
|
|
if !flagCLOEXEC {
|
|
unix.CloseOnExec(fd)
|
|
}
|
|
|
|
// No error, prepare the Conn.
|
|
return New(fd, name)
|
|
case !ready(err):
|
|
// System call interrupted or not ready, try again.
|
|
continue
|
|
case err == unix.EINVAL, err == unix.EPROTONOSUPPORT:
|
|
// On Linux, SOCK_NONBLOCK and SOCK_CLOEXEC were introduced in
|
|
// 2.6.27. On FreeBSD, both flags were introduced in FreeBSD 10.
|
|
// EINVAL and EPROTONOSUPPORT check for earlier versions of these
|
|
// OSes respectively.
|
|
//
|
|
// Mirror what the standard library does when creating file
|
|
// descriptors: avoid racing a fork/exec with the creation of new
|
|
// file descriptors, so that child processes do not inherit socket
|
|
// file descriptors unexpectedly.
|
|
//
|
|
// For a more thorough explanation, see similar work in the Go tree:
|
|
// func sysSocket in net/sock_cloexec.go, as well as the detailed
|
|
// comment in syscall/exec_unix.go.
|
|
syscall.ForkLock.RLock()
|
|
fd, err = unix.Socket(domain, typ, proto)
|
|
if err != nil {
|
|
syscall.ForkLock.RUnlock()
|
|
return nil, os.NewSyscallError("socket", err)
|
|
}
|
|
unix.CloseOnExec(fd)
|
|
syscall.ForkLock.RUnlock()
|
|
|
|
return New(fd, name)
|
|
default:
|
|
// Unhandled error.
|
|
return nil, os.NewSyscallError("socket", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// FileConn returns a copy of the network connection corresponding to the open
|
|
// file. It is the caller's responsibility to close the file when finished.
|
|
// Closing the Conn does not affect the File, and closing the File does not
|
|
// affect the Conn.
|
|
func FileConn(f *os.File, name string) (*Conn, error) {
|
|
// First we'll try to do fctnl(2) with F_DUPFD_CLOEXEC because we can dup
|
|
// the file descriptor and set the flag in one syscall.
|
|
fd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
|
|
switch err {
|
|
case nil:
|
|
// OK, ready to set up non-blocking I/O.
|
|
return New(fd, name)
|
|
case unix.EINVAL:
|
|
// The kernel rejected our fcntl(2), fall back to separate dup(2) and
|
|
// setting close on exec.
|
|
//
|
|
// Mirror what the standard library does when creating file descriptors:
|
|
// avoid racing a fork/exec with the creation of new file descriptors,
|
|
// so that child processes do not inherit socket file descriptors
|
|
// unexpectedly.
|
|
syscall.ForkLock.RLock()
|
|
fd, err := unix.Dup(fd)
|
|
if err != nil {
|
|
syscall.ForkLock.RUnlock()
|
|
return nil, os.NewSyscallError("dup", err)
|
|
}
|
|
unix.CloseOnExec(fd)
|
|
syscall.ForkLock.RUnlock()
|
|
|
|
return New(fd, name)
|
|
default:
|
|
// Any other errors.
|
|
return nil, os.NewSyscallError("fcntl", err)
|
|
}
|
|
}
|
|
|
|
// New wraps an existing file descriptor to create a Conn. name should be a
|
|
// unique name for the socket type such as "netlink" or "vsock".
|
|
//
|
|
// Most callers should use Socket or FileConn to construct a Conn. New is
|
|
// intended for integrating with specific system calls which provide a file
|
|
// descriptor that supports asynchronous I/O. The file descriptor is immediately
|
|
// set to nonblocking mode and registered with Go's runtime network poller for
|
|
// future I/O operations.
|
|
//
|
|
// Unlike FileConn, New does not duplicate the existing file descriptor in any
|
|
// way. The returned Conn takes ownership of the underlying file descriptor.
|
|
func New(fd int, name string) (*Conn, error) {
|
|
// All Conn I/O is nonblocking for integration with Go's runtime network
|
|
// poller. Depending on the OS this might already be set but it can't hurt
|
|
// to set it again.
|
|
if err := unix.SetNonblock(fd, true); err != nil {
|
|
return nil, os.NewSyscallError("setnonblock", err)
|
|
}
|
|
|
|
// os.NewFile registers the non-blocking file descriptor with the runtime
|
|
// poller, which is then used for most subsequent operations except those
|
|
// that require raw I/O via SyscallConn.
|
|
//
|
|
// See also: https://golang.org/pkg/os/#NewFile
|
|
f := os.NewFile(uintptr(fd), name)
|
|
rc, err := f.SyscallConn()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
c := &Conn{
|
|
name: name,
|
|
fd: f,
|
|
rc: rc,
|
|
}
|
|
|
|
// Probe the file descriptor for socket settings.
|
|
sotype, err := c.GetsockoptInt(unix.SOL_SOCKET, unix.SO_TYPE)
|
|
switch {
|
|
case err == nil:
|
|
// File is a socket, check its properties.
|
|
c.facts = facts{
|
|
isStream: sotype == unix.SOCK_STREAM,
|
|
zeroReadIsEOF: sotype != unix.SOCK_DGRAM && sotype != unix.SOCK_RAW,
|
|
}
|
|
case errors.Is(err, unix.ENOTSOCK):
|
|
// File is not a socket, treat it as a regular file.
|
|
c.facts = facts{
|
|
isStream: true,
|
|
zeroReadIsEOF: true,
|
|
}
|
|
default:
|
|
return nil, err
|
|
}
|
|
|
|
return c, nil
|
|
}
|
|
|
|
// Low-level methods which provide raw system call access.
|
|
|
|
// Accept wraps accept(2) or accept4(2) depending on the operating system, but
|
|
// returns a Conn for the accepted connection rather than a raw file descriptor.
|
|
//
|
|
// If the operating system supports accept4(2) (which allows flags),
|
|
// SOCK_CLOEXEC and SOCK_NONBLOCK are automatically applied to flags to mirror
|
|
// the standard library's socket flag behaviors.
|
|
//
|
|
// If the operating system only supports accept(2) (which does not allow flags)
|
|
// and flags is not zero, an error will be returned.
|
|
//
|
|
// Accept obeys context cancelation and uses the deadline set on the context to
|
|
// cancel accepting the next connection. If a deadline is set on ctx, this
|
|
// deadline will override any previous deadlines set using SetDeadline or
|
|
// SetReadDeadline. Upon return, the read deadline is cleared.
|
|
func (c *Conn) Accept(ctx context.Context, flags int) (*Conn, unix.Sockaddr, error) {
|
|
type ret struct {
|
|
nfd int
|
|
sa unix.Sockaddr
|
|
}
|
|
|
|
r, err := readT(c, ctx, sysAccept, func(fd int) (ret, error) {
|
|
// Either accept(2) or accept4(2) depending on the OS.
|
|
nfd, sa, err := accept(fd, flags|socketFlags)
|
|
return ret{nfd, sa}, err
|
|
})
|
|
if err != nil {
|
|
// internal/poll, context error, or user function error.
|
|
return nil, nil, err
|
|
}
|
|
|
|
// Successfully accepted a connection, wrap it in a Conn for use by the
|
|
// caller.
|
|
ac, err := New(r.nfd, c.name)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
return ac, r.sa, nil
|
|
}
|
|
|
|
// Bind wraps bind(2).
|
|
func (c *Conn) Bind(sa unix.Sockaddr) error {
|
|
return c.control(context.Background(), "bind", func(fd int) error {
|
|
return unix.Bind(fd, sa)
|
|
})
|
|
}
|
|
|
|
// Connect wraps connect(2). In order to verify that the underlying socket is
|
|
// connected to a remote peer, Connect calls getpeername(2) and returns the
|
|
// unix.Sockaddr from that call.
|
|
//
|
|
// Connect obeys context cancelation and uses the deadline set on the context to
|
|
// cancel connecting to a remote peer. If a deadline is set on ctx, this
|
|
// deadline will override any previous deadlines set using SetDeadline or
|
|
// SetWriteDeadline. Upon return, the write deadline is cleared.
|
|
func (c *Conn) Connect(ctx context.Context, sa unix.Sockaddr) (unix.Sockaddr, error) {
|
|
const op = "connect"
|
|
|
|
// TODO(mdlayher): it would seem that trying to connect to unbound vsock
|
|
// listeners by calling Connect multiple times results in ECONNRESET for the
|
|
// first and nil error for subsequent calls. Do we need to memoize the
|
|
// error? Check what the stdlib behavior is.
|
|
|
|
var (
|
|
// Track progress between invocations of the write closure. We don't
|
|
// have an explicit WaitWrite call like internal/poll does, so we have
|
|
// to wait until the runtime calls the closure again to indicate we can
|
|
// write.
|
|
progress uint32
|
|
|
|
// Capture closure sockaddr and error.
|
|
rsa unix.Sockaddr
|
|
err error
|
|
)
|
|
|
|
doErr := c.write(ctx, op, func(fd int) error {
|
|
if atomic.AddUint32(&progress, 1) == 1 {
|
|
// First call: initiate connect.
|
|
return unix.Connect(fd, sa)
|
|
}
|
|
|
|
// Subsequent calls: the runtime network poller indicates fd is
|
|
// writable. Check for errno.
|
|
errno, gerr := c.GetsockoptInt(unix.SOL_SOCKET, unix.SO_ERROR)
|
|
if gerr != nil {
|
|
return gerr
|
|
}
|
|
if errno != 0 {
|
|
// Connection is still not ready or failed. If errno indicates
|
|
// the socket is not ready, we will wait for the next write
|
|
// event. Otherwise we propagate this errno back to the as a
|
|
// permanent error.
|
|
uerr := unix.Errno(errno)
|
|
err = uerr
|
|
return uerr
|
|
}
|
|
|
|
// According to internal/poll, it's possible for the runtime network
|
|
// poller to spuriously wake us and return errno 0 for SO_ERROR.
|
|
// Make sure we are actually connected to a peer.
|
|
peer, err := c.Getpeername()
|
|
if err != nil {
|
|
// internal/poll unconditionally goes back to WaitWrite.
|
|
// Synthesize an error that will do the same for us.
|
|
return unix.EAGAIN
|
|
}
|
|
|
|
// Connection complete.
|
|
rsa = peer
|
|
return nil
|
|
})
|
|
if doErr != nil {
|
|
// internal/poll or context error.
|
|
return nil, doErr
|
|
}
|
|
|
|
if err == unix.EISCONN {
|
|
// TODO(mdlayher): is this block obsolete with the addition of the
|
|
// getsockopt SO_ERROR check above?
|
|
//
|
|
// EISCONN is reported if the socket is already established and should
|
|
// not be treated as an error.
|
|
// - Darwin reports this for at least TCP sockets
|
|
// - Linux reports this for at least AF_VSOCK sockets
|
|
return rsa, nil
|
|
}
|
|
|
|
return rsa, os.NewSyscallError(op, err)
|
|
}
|
|
|
|
// Getsockname wraps getsockname(2).
|
|
func (c *Conn) Getsockname() (unix.Sockaddr, error) {
|
|
return controlT(c, context.Background(), "getsockname", unix.Getsockname)
|
|
}
|
|
|
|
// Getpeername wraps getpeername(2).
|
|
func (c *Conn) Getpeername() (unix.Sockaddr, error) {
|
|
return controlT(c, context.Background(), "getpeername", unix.Getpeername)
|
|
}
|
|
|
|
// GetsockoptInt wraps getsockopt(2) for integer values.
|
|
func (c *Conn) GetsockoptInt(level, opt int) (int, error) {
|
|
return controlT(c, context.Background(), "getsockopt", func(fd int) (int, error) {
|
|
return unix.GetsockoptInt(fd, level, opt)
|
|
})
|
|
}
|
|
|
|
// Listen wraps listen(2).
|
|
func (c *Conn) Listen(n int) error {
|
|
return c.control(context.Background(), "listen", func(fd int) error {
|
|
return unix.Listen(fd, n)
|
|
})
|
|
}
|
|
|
|
// Recvmsg wraps recvmsg(2).
|
|
func (c *Conn) Recvmsg(ctx context.Context, p, oob []byte, flags int) (int, int, int, unix.Sockaddr, error) {
|
|
type ret struct {
|
|
n, oobn, recvflags int
|
|
from unix.Sockaddr
|
|
}
|
|
|
|
r, err := readT(c, ctx, "recvmsg", func(fd int) (ret, error) {
|
|
n, oobn, recvflags, from, err := unix.Recvmsg(fd, p, oob, flags)
|
|
return ret{n, oobn, recvflags, from}, err
|
|
})
|
|
if r.n == 0 && err == nil && c.facts.zeroReadIsEOF {
|
|
return 0, 0, 0, nil, io.EOF
|
|
}
|
|
|
|
return r.n, r.oobn, r.recvflags, r.from, err
|
|
}
|
|
|
|
// Recvfrom wraps recvfrom(2).
|
|
func (c *Conn) Recvfrom(ctx context.Context, p []byte, flags int) (int, unix.Sockaddr, error) {
|
|
type ret struct {
|
|
n int
|
|
addr unix.Sockaddr
|
|
}
|
|
|
|
out, err := readT(c, ctx, "recvfrom", func(fd int) (ret, error) {
|
|
n, addr, err := unix.Recvfrom(fd, p, flags)
|
|
return ret{n, addr}, err
|
|
})
|
|
if out.n == 0 && err == nil && c.facts.zeroReadIsEOF {
|
|
return 0, nil, io.EOF
|
|
}
|
|
|
|
return out.n, out.addr, err
|
|
}
|
|
|
|
// Sendmsg wraps sendmsg(2).
|
|
func (c *Conn) Sendmsg(ctx context.Context, p, oob []byte, to unix.Sockaddr, flags int) (int, error) {
|
|
return writeT(c, ctx, "sendmsg", func(fd int) (int, error) {
|
|
return unix.SendmsgN(fd, p, oob, to, flags)
|
|
})
|
|
}
|
|
|
|
// Sendto wraps sendto(2).
|
|
func (c *Conn) Sendto(ctx context.Context, p []byte, flags int, to unix.Sockaddr) error {
|
|
return c.write(ctx, "sendto", func(fd int) error {
|
|
return unix.Sendto(fd, p, flags, to)
|
|
})
|
|
}
|
|
|
|
// SetsockoptInt wraps setsockopt(2) for integer values.
|
|
func (c *Conn) SetsockoptInt(level, opt, value int) error {
|
|
return c.control(context.Background(), "setsockopt", func(fd int) error {
|
|
return unix.SetsockoptInt(fd, level, opt, value)
|
|
})
|
|
}
|
|
|
|
// Shutdown wraps shutdown(2).
|
|
func (c *Conn) Shutdown(how int) error {
|
|
return c.control(context.Background(), "shutdown", func(fd int) error {
|
|
return unix.Shutdown(fd, how)
|
|
})
|
|
}
|
|
|
|
// Conn low-level read/write/control functions. These functions mirror the
|
|
// syscall.RawConn APIs but the input closures return errors rather than
|
|
// booleans.
|
|
|
|
// read wraps readT to execute a function and capture its error result. This is
|
|
// a convenience wrapper for functions which don't return any extra values.
|
|
func (c *Conn) read(ctx context.Context, op string, f func(fd int) error) error {
|
|
_, err := readT(c, ctx, op, func(fd int) (struct{}, error) {
|
|
return struct{}{}, f(fd)
|
|
})
|
|
return err
|
|
}
|
|
|
|
// write executes f, a write function, against the associated file descriptor.
|
|
// op is used to create an *os.SyscallError if the file descriptor is closed.
|
|
func (c *Conn) write(ctx context.Context, op string, f func(fd int) error) error {
|
|
_, err := writeT(c, ctx, op, func(fd int) (struct{}, error) {
|
|
return struct{}{}, f(fd)
|
|
})
|
|
return err
|
|
}
|
|
|
|
// readT executes c.rc.Read for op using the input function, returning a newly
|
|
// allocated result T.
|
|
func readT[T any](c *Conn, ctx context.Context, op string, f func(fd int) (T, error)) (T, error) {
|
|
return rwT(c, rwContext[T]{
|
|
Context: ctx,
|
|
Type: read,
|
|
Op: op,
|
|
Do: f,
|
|
})
|
|
}
|
|
|
|
// writeT executes c.rc.Write for op using the input function, returning a newly
|
|
// allocated result T.
|
|
func writeT[T any](c *Conn, ctx context.Context, op string, f func(fd int) (T, error)) (T, error) {
|
|
return rwT(c, rwContext[T]{
|
|
Context: ctx,
|
|
Type: write,
|
|
Op: op,
|
|
Do: f,
|
|
})
|
|
}
|
|
|
|
// readWrite indicates if an operation intends to read or write.
|
|
type readWrite bool
|
|
|
|
// Possible readWrite values.
|
|
const (
|
|
read readWrite = false
|
|
write readWrite = true
|
|
)
|
|
|
|
// An rwContext provides arguments to rwT.
|
|
type rwContext[T any] struct {
|
|
// The caller's context passed for cancelation.
|
|
Context context.Context
|
|
|
|
// The type of an operation: read or write.
|
|
Type readWrite
|
|
|
|
// The name of the operation used in errors.
|
|
Op string
|
|
|
|
// The actual function to perform.
|
|
Do func(fd int) (T, error)
|
|
}
|
|
|
|
// rwT executes c.rc.Read or c.rc.Write (depending on the value of rw.Type) for
|
|
// rw.Op using the input function, returning a newly allocated result T.
|
|
//
|
|
// It obeys context cancelation and the rw.Context must not be nil.
|
|
func rwT[T any](c *Conn, rw rwContext[T]) (T, error) {
|
|
if atomic.LoadUint32(&c.closed) != 0 {
|
|
// If the file descriptor is already closed, do nothing.
|
|
return *new(T), os.NewSyscallError(rw.Op, unix.EBADF)
|
|
}
|
|
|
|
if err := rw.Context.Err(); err != nil {
|
|
// Early exit due to context cancel.
|
|
return *new(T), os.NewSyscallError(rw.Op, err)
|
|
}
|
|
|
|
var (
|
|
// The read or write function used to access the runtime network poller.
|
|
poll func(func(uintptr) bool) error
|
|
|
|
// The read or write function used to set the matching deadline.
|
|
deadline func(time.Time) error
|
|
)
|
|
|
|
if rw.Type == write {
|
|
poll = c.rc.Write
|
|
deadline = c.SetWriteDeadline
|
|
} else {
|
|
poll = c.rc.Read
|
|
deadline = c.SetReadDeadline
|
|
}
|
|
|
|
var (
|
|
// Whether or not the context carried a deadline we are actively using
|
|
// for cancelation.
|
|
setDeadline bool
|
|
|
|
// Signals for the cancelation watcher goroutine.
|
|
wg sync.WaitGroup
|
|
doneC = make(chan struct{})
|
|
|
|
// Atomic: reports whether we have to disarm the deadline.
|
|
//
|
|
// TODO(mdlayher): switch back to atomic.Bool when we drop support for
|
|
// Go 1.18.
|
|
needDisarm int64
|
|
)
|
|
|
|
// On cancel, clean up the watcher.
|
|
defer func() {
|
|
close(doneC)
|
|
wg.Wait()
|
|
}()
|
|
|
|
if d, ok := rw.Context.Deadline(); ok {
|
|
// The context has an explicit deadline. We will use it for cancelation
|
|
// but disarm it after poll for the next call.
|
|
if err := deadline(d); err != nil {
|
|
return *new(T), err
|
|
}
|
|
setDeadline = true
|
|
atomic.AddInt64(&needDisarm, 1)
|
|
} else {
|
|
// The context does not have an explicit deadline. We have to watch for
|
|
// cancelation so we can propagate that signal to immediately unblock
|
|
// the runtime network poller.
|
|
//
|
|
// TODO(mdlayher): is it possible to detect a background context vs a
|
|
// context with possible future cancel?
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
|
|
select {
|
|
case <-rw.Context.Done():
|
|
// Cancel the operation. Make the caller disarm after poll
|
|
// returns.
|
|
atomic.AddInt64(&needDisarm, 1)
|
|
_ = deadline(time.Unix(0, 1))
|
|
case <-doneC:
|
|
// Nothing to do.
|
|
}
|
|
}()
|
|
}
|
|
|
|
var (
|
|
t T
|
|
err error
|
|
)
|
|
|
|
pollErr := poll(func(fd uintptr) bool {
|
|
t, err = rw.Do(int(fd))
|
|
return ready(err)
|
|
})
|
|
|
|
if atomic.LoadInt64(&needDisarm) > 0 {
|
|
_ = deadline(time.Time{})
|
|
}
|
|
|
|
if pollErr != nil {
|
|
if rw.Context.Err() != nil || (setDeadline && errors.Is(pollErr, os.ErrDeadlineExceeded)) {
|
|
// The caller canceled the operation or we set a deadline internally
|
|
// and it was reached.
|
|
//
|
|
// Unpack a plain context error. We wait for the context to be done
|
|
// to synchronize state externally. Otherwise we have noticed I/O
|
|
// timeout wakeups when we set a deadline but the context was not
|
|
// yet marked done.
|
|
<-rw.Context.Done()
|
|
return *new(T), os.NewSyscallError(rw.Op, rw.Context.Err())
|
|
}
|
|
|
|
// Error from syscall.RawConn methods. Conventionally the standard
|
|
// library does not wrap internal/poll errors in os.NewSyscallError.
|
|
return *new(T), pollErr
|
|
}
|
|
|
|
// Result from user function.
|
|
return t, os.NewSyscallError(rw.Op, err)
|
|
}
|
|
|
|
// control executes Conn.control for op using the input function.
|
|
func (c *Conn) control(ctx context.Context, op string, f func(fd int) error) error {
|
|
_, err := controlT(c, ctx, op, func(fd int) (struct{}, error) {
|
|
return struct{}{}, f(fd)
|
|
})
|
|
return err
|
|
}
|
|
|
|
// controlT executes c.rc.Control for op using the input function, returning a
|
|
// newly allocated result T.
|
|
func controlT[T any](c *Conn, ctx context.Context, op string, f func(fd int) (T, error)) (T, error) {
|
|
if atomic.LoadUint32(&c.closed) != 0 {
|
|
// If the file descriptor is already closed, do nothing.
|
|
return *new(T), os.NewSyscallError(op, unix.EBADF)
|
|
}
|
|
|
|
var (
|
|
t T
|
|
err error
|
|
)
|
|
|
|
doErr := c.rc.Control(func(fd uintptr) {
|
|
// Repeatedly attempt the syscall(s) invoked by f until completion is
|
|
// indicated by the return value of ready or the context is canceled.
|
|
//
|
|
// The last values for t and err are captured outside of the closure for
|
|
// use when the loop breaks.
|
|
for {
|
|
if err = ctx.Err(); err != nil {
|
|
// Early exit due to context cancel.
|
|
return
|
|
}
|
|
|
|
t, err = f(int(fd))
|
|
if ready(err) {
|
|
return
|
|
}
|
|
}
|
|
})
|
|
if doErr != nil {
|
|
// Error from syscall.RawConn methods. Conventionally the standard
|
|
// library does not wrap internal/poll errors in os.NewSyscallError.
|
|
return *new(T), doErr
|
|
}
|
|
|
|
// Result from user function.
|
|
return t, os.NewSyscallError(op, err)
|
|
}
|
|
|
|
// ready indicates readiness based on the value of err.
|
|
func ready(err error) bool {
|
|
switch err {
|
|
case unix.EAGAIN, unix.EINPROGRESS, unix.EINTR:
|
|
// When a socket is in non-blocking mode, we might see a variety of errors:
|
|
// - EAGAIN: most common case for a socket read not being ready
|
|
// - EINPROGRESS: reported by some sockets when first calling connect
|
|
// - EINTR: system call interrupted, more frequently occurs in Go 1.14+
|
|
// because goroutines can be asynchronously preempted
|
|
//
|
|
// Return false to let the poller wait for readiness. See the source code
|
|
// for internal/poll.FD.RawRead for more details.
|
|
return false
|
|
default:
|
|
// Ready regardless of whether there was an error or no error.
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Darwin and FreeBSD can't read or write 2GB+ files at a time,
|
|
// even on 64-bit systems.
|
|
// The same is true of socket implementations on many systems.
|
|
// See golang.org/issue/7812 and golang.org/issue/16266.
|
|
// Use 1GB instead of, say, 2GB-1, to keep subsequent reads aligned.
|
|
const maxRW = 1 << 30
|