/* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
 * EPOLL* counterparts. We use the POLL* variants in this file because that
 * is what libuv uses elsewhere.
 */

#include "uv.h"
|
|
#include "internal.h"
|
|
|
|
#include <inttypes.h>
|
|
#include <stdatomic.h>
|
|
#include <stddef.h> /* offsetof */
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include <errno.h>
|
|
|
|
#include <fcntl.h>
|
|
#include <net/if.h>
|
|
#include <sys/epoll.h>
|
|
#include <sys/inotify.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/param.h>
|
|
#include <sys/prctl.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/syscall.h>
|
|
#include <sys/sysinfo.h>
|
|
#include <sys/sysmacros.h>
|
|
#include <sys/types.h>
|
|
#include <sys/utsname.h>
|
|
#include <time.h>
|
|
#include <unistd.h>
|
|
|
|
#ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 425
#endif

#ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 426
#endif

#ifndef __NR_io_uring_register
# define __NR_io_uring_register 427
#endif

#ifndef __NR_copy_file_range
# if defined(__x86_64__)
#  define __NR_copy_file_range 326
# elif defined(__i386__)
#  define __NR_copy_file_range 377
# elif defined(__s390__)
#  define __NR_copy_file_range 375
# elif defined(__arm__)
#  define __NR_copy_file_range 391
# elif defined(__aarch64__)
#  define __NR_copy_file_range 285
# elif defined(__powerpc__)
#  define __NR_copy_file_range 379
# elif defined(__arc__)
#  define __NR_copy_file_range 285
# endif
#endif /* __NR_copy_file_range */

#ifndef __NR_statx
# if defined(__x86_64__)
#  define __NR_statx 332
# elif defined(__i386__)
#  define __NR_statx 383
# elif defined(__aarch64__)
#  define __NR_statx 397
# elif defined(__arm__)
#  define __NR_statx 397
# elif defined(__ppc__)
#  define __NR_statx 383
# elif defined(__s390__)
#  define __NR_statx 379
# endif
#endif /* __NR_statx */

#ifndef __NR_getrandom
# if defined(__x86_64__)
#  define __NR_getrandom 318
# elif defined(__i386__)
#  define __NR_getrandom 355
# elif defined(__aarch64__)
#  define __NR_getrandom 384
# elif defined(__arm__)
#  define __NR_getrandom 384
# elif defined(__ppc__)
#  define __NR_getrandom 359
# elif defined(__s390__)
#  define __NR_getrandom 349
# endif
#endif /* __NR_getrandom */

#define HAVE_IFADDRS_H 1

#if defined(__ANDROID_API__) && __ANDROID_API__ < 24
# undef HAVE_IFADDRS_H
#endif

#ifdef __UCLIBC__
# if __UCLIBC_MAJOR__ < 0 && __UCLIBC_MINOR__ < 9 && __UCLIBC_SUBLEVEL__ < 32
#  undef HAVE_IFADDRS_H
# endif
#endif

#ifdef HAVE_IFADDRS_H
# include <ifaddrs.h>
# include <sys/socket.h>
# include <net/ethernet.h>
# include <netpacket/packet.h>
#endif /* HAVE_IFADDRS_H */

enum {
  UV__IORING_SETUP_SQPOLL = 2u,
};

enum {
  UV__IORING_FEAT_SINGLE_MMAP = 1u,
  UV__IORING_FEAT_NODROP = 2u,
  UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */
};

enum {
  UV__IORING_OP_READV = 1,
  UV__IORING_OP_WRITEV = 2,
  UV__IORING_OP_FSYNC = 3,
  UV__IORING_OP_OPENAT = 18,
  UV__IORING_OP_CLOSE = 19,
  UV__IORING_OP_STATX = 21,
  UV__IORING_OP_EPOLL_CTL = 29,
  UV__IORING_OP_RENAMEAT = 35,
  UV__IORING_OP_UNLINKAT = 36,
  UV__IORING_OP_MKDIRAT = 37,
  UV__IORING_OP_SYMLINKAT = 38,
  UV__IORING_OP_LINKAT = 39,
};

enum {
  UV__IORING_ENTER_GETEVENTS = 1u,
  UV__IORING_ENTER_SQ_WAKEUP = 2u,
};

enum {
  UV__IORING_SQ_NEED_WAKEUP = 1u,
  UV__IORING_SQ_CQ_OVERFLOW = 2u,
};

enum {
  UV__MKDIRAT_SYMLINKAT_LINKAT = 1u,
};

struct uv__io_cqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t overflow;
  uint32_t cqes;
  uint64_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));

struct uv__io_sqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t flags;
  uint32_t dropped;
  uint32_t array;
  uint32_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));

struct uv__io_uring_cqe {
  uint64_t user_data;
  int32_t res;
  uint32_t flags;
};

STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));

struct uv__io_uring_sqe {
  uint8_t opcode;
  uint8_t flags;
  uint16_t ioprio;
  int32_t fd;
  union {
    uint64_t off;
    uint64_t addr2;
  };
  union {
    uint64_t addr;
  };
  uint32_t len;
  union {
    uint32_t rw_flags;
    uint32_t fsync_flags;
    uint32_t open_flags;
    uint32_t statx_flags;
  };
  uint64_t user_data;
  union {
    uint16_t buf_index;
    uint64_t pad[3];
  };
};

STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));

struct uv__io_uring_params {
  uint32_t sq_entries;
  uint32_t cq_entries;
  uint32_t flags;
  uint32_t sq_thread_cpu;
  uint32_t sq_thread_idle;
  uint32_t features;
  uint32_t reserved[4];
  struct uv__io_sqring_offsets sq_off; /* 40 bytes */
  struct uv__io_cqring_offsets cq_off; /* 40 bytes */
};

STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));

STATIC_ASSERT(EPOLL_CTL_ADD < 4);
STATIC_ASSERT(EPOLL_CTL_DEL < 4);
STATIC_ASSERT(EPOLL_CTL_MOD < 4);

struct watcher_list {
  RB_ENTRY(watcher_list) entry;
  struct uv__queue watchers;
  int iterating;
  char* path;
  int wd;
};

struct watcher_root {
  struct watcher_list* rbh_root;
};

static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* w,
                             unsigned int revents);
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b);
static void maybe_free_watcher_list(struct watcher_list* w,
                                    uv_loop_t* loop);

static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]);

static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e);

RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)


static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
  /* This cast works because watcher_root is a struct with a pointer as its
   * sole member. Such type punning is unsafe in the presence of strict
   * pointer aliasing (and is just plain nasty) but that is why libuv
   * is compiled with -fno-strict-aliasing.
   */
  return (struct watcher_root*) &loop->inotify_watchers;
}


unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;

  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  if (-1 == uname(&u))
    return 0;

  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}


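/* Illustrative note (not part of the original source): uv__kernel_version()
 * packs "major.minor.patch" as major * 65536 + minor * 256 + patch, i.e. one
 * byte per component. A caller can compare against a literal or unpack it:
 *
 *   unsigned v = uv__kernel_version();   // e.g. 5.15.90 -> 0x050F5A
 *   unsigned major = (v >> 16) & 0xFF;   // 5
 *   unsigned minor = (v >> 8) & 0xFF;    // 15
 *   unsigned patch = v & 0xFF;           // 90
 *
 * This is why the version checks later in this file are written as hex
 * literals such as 0x050F00 (5.15.0).
 */
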
ssize_t
uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
                       int fd_out,
                       off_t* off_out,
                       size_t len,
                       unsigned int flags)
{
#ifdef __NR_copy_file_range
  return syscall(__NR_copy_file_range,
                 fd_in,
                 off_in,
                 fd_out,
                 off_out,
                 len,
                 flags);
#else
  return errno = ENOSYS, -1;
#endif
}


int uv__statx(int dirfd,
              const char* path,
              int flags,
              unsigned int mask,
              struct uv__statx* statxbuf) {
#if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
  return errno = ENOSYS, -1;
#else
  int rc;

  rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
  if (rc >= 0)
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));

  return rc;
#endif
}


ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
#if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
  return errno = ENOSYS, -1;
#else
  ssize_t rc;

  rc = syscall(__NR_getrandom, buf, buflen, flags);
  if (rc >= 0)
    uv__msan_unpoison(buf, buflen);

  return rc;
#endif
}


int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
  return syscall(__NR_io_uring_setup, entries, params);
}


int uv__io_uring_enter(int fd,
                       unsigned to_submit,
                       unsigned min_complete,
                       unsigned flags) {
  /* io_uring_enter used to take a sigset_t but it's unused
   * in newer kernels unless IORING_ENTER_EXT_ARG is set,
   * in which case it takes a struct io_uring_getevents_arg.
   */
  return syscall(__NR_io_uring_enter,
                 fd,
                 to_submit,
                 min_complete,
                 flags,
                 NULL,
                 0L);
}


int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
  return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
}


static int uv__use_io_uring(void) {
#if defined(__ANDROID_API__)
  return 0; /* Possibly available but blocked by seccomp. */
#else
  /* Ternary: unknown=0, yes=1, no=-1 */
  static _Atomic int use_io_uring;
  char* val;
  int use;

  use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);

  if (use == 0) {
    val = getenv("UV_USE_IO_URING");
    use = val == NULL || atoi(val) ? 1 : -1;
    atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
  }

  return use > 0;
#endif
}


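/* Illustrative note (not part of the original source): the check above means
 * io_uring is on by default and is only disabled when UV_USE_IO_URING is set
 * to something atoi() parses as 0, e.g.:
 *
 *   $ UV_USE_IO_URING=0 ./myprog    # force the epoll/threadpool paths
 *   $ UV_USE_IO_URING=1 ./myprog    # explicitly keep io_uring enabled
 *
 * The decision is cached in a relaxed atomic, so the environment variable is
 * only consulted once per process.
 */
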
static void uv__iou_init(int epollfd,
                         struct uv__iou* iou,
                         uint32_t entries,
                         uint32_t flags) {
  struct uv__io_uring_params params;
  struct epoll_event e;
  size_t cqlen;
  size_t sqlen;
  size_t maxlen;
  size_t sqelen;
  uint32_t i;
  char* sq;
  char* sqe;
  int ringfd;

  sq = MAP_FAILED;
  sqe = MAP_FAILED;

  if (!uv__use_io_uring())
    return;

  /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
   * Mostly academic because we check for a v5.13 kernel afterwards anyway.
   */
  memset(&params, 0, sizeof(params));
  params.flags = flags;

  if (flags & UV__IORING_SETUP_SQPOLL)
    params.sq_thread_idle = 10; /* milliseconds */

  /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
  ringfd = uv__io_uring_setup(entries, &params);
  if (ringfd == -1)
    return;

  /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
   * actually detecting is whether IORING_OP_STATX works with SQPOLL.
   */
  if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_NODROP))
    goto fail;

  sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
  cqlen =
      params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
  maxlen = sqlen < cqlen ? cqlen : sqlen;
  sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);

  sq = mmap(0,
            maxlen,
            PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_POPULATE,
            ringfd,
            0); /* IORING_OFF_SQ_RING */

  sqe = mmap(0,
             sqelen,
             PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_POPULATE,
             ringfd,
             0x10000000ull); /* IORING_OFF_SQES */

  if (sq == MAP_FAILED || sqe == MAP_FAILED)
    goto fail;

  if (flags & UV__IORING_SETUP_SQPOLL) {
    /* Only interested in completion events. To get notified when
     * the kernel pulls items from the submission ring, add POLLOUT.
     */
    memset(&e, 0, sizeof(e));
    e.events = POLLIN;
    e.data.fd = ringfd;

    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
      goto fail;
  }

  iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
  iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
  iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
  iou->sqarray = (uint32_t*) (sq + params.sq_off.array);
  iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
  iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
  iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
  iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
  iou->sq = sq;
  iou->cqe = sq + params.cq_off.cqes;
  iou->sqe = sqe;
  iou->sqlen = sqlen;
  iou->cqlen = cqlen;
  iou->maxlen = maxlen;
  iou->sqelen = sqelen;
  iou->ringfd = ringfd;
  iou->in_flight = 0;
  iou->flags = 0;

  if (uv__kernel_version() >= /* 5.15.0 */ 0x050F00)
    iou->flags |= UV__MKDIRAT_SYMLINKAT_LINKAT;

  for (i = 0; i <= iou->sqmask; i++)
    iou->sqarray[i] = i; /* Slot -> sqe identity mapping. */

  return;

fail:
  if (sq != MAP_FAILED)
    munmap(sq, maxlen);

  if (sqe != MAP_FAILED)
    munmap(sqe, sqelen);

  uv__close(ringfd);
}


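/* Illustrative note (not part of the original source): with
 * IORING_FEAT_SINGLE_MMAP the SQ and CQ rings live in one shared mapping, so
 * only the larger of the two extents needs to be mapped. For example, with
 * entries == 64 the kernel typically reports sq_entries == 64 and
 * cq_entries == 128, giving roughly:
 *
 *   sqlen  = sq_off.array + 64 * sizeof(uint32_t);                // index array
 *   cqlen  = cq_off.cqes  + 128 * sizeof(struct uv__io_uring_cqe);
 *   maxlen = sqlen < cqlen ? cqlen : sqlen;                       // map once
 *   sqelen = 64 * sizeof(struct uv__io_uring_sqe);                // second map
 *
 * The SQE array is always a separate mapping at offset IORING_OFF_SQES
 * (0x10000000).
 */
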
static void uv__iou_delete(struct uv__iou* iou) {
  if (iou->ringfd != -1) {
    munmap(iou->sq, iou->maxlen);
    munmap(iou->sqe, iou->sqelen);
    uv__close(iou->ringfd);
    iou->ringfd = -1;
  }
}


int uv__platform_loop_init(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  lfields->ctl.ringfd = -1;
  lfields->iou.ringfd = -1;

  loop->inotify_watchers = NULL;
  loop->inotify_fd = -1;
  loop->backend_fd = epoll_create1(O_CLOEXEC);

  if (loop->backend_fd == -1)
    return UV__ERR(errno);

  uv__iou_init(loop->backend_fd, &lfields->iou, 64, UV__IORING_SETUP_SQPOLL);
  uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);

  return 0;
}


int uv__io_fork(uv_loop_t* loop) {
  int err;
  struct watcher_list* root;

  root = uv__inotify_watchers(loop)->rbh_root;

  uv__close(loop->backend_fd);
  loop->backend_fd = -1;

  /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
  uv__platform_loop_delete(loop);

  err = uv__platform_loop_init(loop);
  if (err)
    return err;

  return uv__inotify_fork(loop, root);
}


void uv__platform_loop_delete(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  uv__iou_delete(&lfields->ctl);
  uv__iou_delete(&lfields->iou);

  if (loop->inotify_fd != -1) {
    uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
    uv__close(loop->inotify_fd);
    loop->inotify_fd = -1;
  }
}


struct uv__invalidate {
  struct epoll_event (*prep)[256];
  struct epoll_event* events;
  int nfds;
};


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  uv__loop_internal_fields_t* lfields;
  struct uv__invalidate* inv;
  struct epoll_event dummy;
  int i;

  lfields = uv__get_internal_fields(loop);
  inv = lfields->inv;

  /* Invalidate events with same file descriptor */
  if (inv != NULL)
    for (i = 0; i < inv->nfds; i++)
      if (inv->events[i].data.fd == fd)
        inv->events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   *
   * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
   * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
   */
  memset(&dummy, 0, sizeof(dummy));

  if (inv == NULL) {
    epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
  } else {
    uv__epoll_ctl_prep(loop->backend_fd,
                       &lfields->ctl,
                       inv->prep,
                       EPOLL_CTL_DEL,
                       fd,
                       &dummy);
  }
}


int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}


/* Caller must initialize SQE and call uv__iou_submit(). */
static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
                                                uv_loop_t* loop,
                                                uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t slot;

  if (iou->ringfd == -1)
    return NULL;

  head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
                              memory_order_acquire);
  tail = *iou->sqtail;
  mask = iou->sqmask;

  if ((head & mask) == ((tail + 1) & mask))
    return NULL; /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */

  slot = tail & mask;
  sqe = iou->sqe;
  sqe = &sqe[slot];
  memset(sqe, 0, sizeof(*sqe));
  sqe->user_data = (uintptr_t) req;

  /* Pacify uv_cancel(). */
  req->work_req.loop = loop;
  req->work_req.work = NULL;
  req->work_req.done = NULL;
  uv__queue_init(&req->work_req.wq);

  uv__req_register(loop, req);
  iou->in_flight++;

  return sqe;
}


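/* Illustrative note (not part of the original source): the fullness test
 * above relies on the ring size being a power of two, so sqmask is
 * ring_entries - 1. With 64 entries (mask 0x3f), head == 10 and tail == 73
 * gives (10 & 0x3f) == ((73 + 1) & 0x3f) == 10, i.e. advancing the tail
 * would collide with the head, so the ring is treated as full and the
 * caller falls back to the threadpool.
 */
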
static void uv__iou_submit(struct uv__iou* iou) {
  uint32_t flags;

  atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
                        *iou->sqtail + 1,
                        memory_order_release);

  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_NEED_WAKEUP)
    if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
      if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */
        perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */
}


int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  /* Work around a poorly understood bug in older kernels where closing a file
   * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
   * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
   * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
   * but good candidates are the several data race fixes. Interestingly, it
   * seems to manifest only when running under Docker so the possibility of
   * a Docker bug can't be completely ruled out either. Yay, computers.
   */
  if (uv__kernel_version() < /* 5.15.90 */ 0x050F5A)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->fd = req->file;
  sqe->opcode = UV__IORING_OP_CLOSE;

  uv__iou_submit(iou);

  return 1;
}


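/* Illustrative note (not part of the original source): each uv__iou_fs_*()
 * helper in this file follows the same convention. It returns 1 when the
 * request was queued on the io_uring submission ring and 0 when the caller
 * should fall back to another path (typically the threadpool), e.g. because
 * the ring is unavailable, full, or the operation is not supported on this
 * kernel.
 */
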
int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
                                  uv_fs_t* req,
                                  uint32_t fsync_flags) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  /* Little known fact: setting sqe->off and sqe->len turns
   * it into an asynchronous sync_file_range() operation.
   */
  sqe->fd = req->file;
  sqe->fsync_flags = fsync_flags;
  sqe->opcode = UV__IORING_OP_FSYNC;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
    return 0;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->len = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_LINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
    return 0;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->len = req->mode;
  sqe->opcode = UV__IORING_OP_MKDIRAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->len = req->mode;
  sqe->opcode = UV__IORING_OP_OPENAT;
  sqe->open_flags = req->flags | O_CLOEXEC;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->len = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_RENAMEAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
    return 0;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->opcode = UV__IORING_OP_SYMLINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_UNLINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_read_or_write(uv_loop_t* loop,
                             uv_fs_t* req,
                             int is_read) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall
   * back to the threadpool on writes */
  if (req->nbufs > IOV_MAX) {
    if (is_read)
      req->nbufs = IOV_MAX;
    else
      return 0;
  }

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->bufs;
  sqe->fd = req->file;
  sqe->len = req->nbufs;
  sqe->off = req->off < 0 ? -1 : req->off;
  sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;

  uv__iou_submit(iou);

  return 1;
}


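/* Illustrative note (not part of the original source): sqe->off mirrors the
 * preadv2()/pwritev2() convention. A non-negative req->off requests
 * positioned I/O at that byte offset, while an offset of -1 asks the kernel
 * to use (and advance) the file's current position, matching how
 * uv_fs_read(..., -1, ...) behaves on the threadpool path.
 */
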
int uv__iou_fs_statx(uv_loop_t* loop,
                     uv_fs_t* req,
                     int is_fstat,
                     int is_lstat) {
  struct uv__io_uring_sqe* sqe;
  struct uv__statx* statxbuf;
  struct uv__iou* iou;

  statxbuf = uv__malloc(sizeof(*statxbuf));
  if (statxbuf == NULL)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL) {
    uv__free(statxbuf);
    return 0;
  }

  req->ptr = statxbuf;

  sqe->addr = (uintptr_t) req->path;
  sqe->addr2 = (uintptr_t) statxbuf;
  sqe->fd = AT_FDCWD;
  sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
  sqe->opcode = UV__IORING_OP_STATX;

  if (is_fstat) {
    sqe->addr = (uintptr_t) "";
    sqe->fd = req->file;
    sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
  }

  if (is_lstat)
    sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;

  uv__iou_submit(iou);

  return 1;
}


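/* Illustrative note (not part of the original source): the 0xFFF request
 * mask above is STATX_BASIC_STATS (0x7ff) | STATX_BTIME (0x800), and the
 * fstat variant works by passing an empty path with AT_EMPTY_PATH (0x1000)
 * so that statx() operates on sqe->fd itself rather than on a path relative
 * to it.
 */
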
void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
  buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
  buf->st_mode = statxbuf->stx_mode;
  buf->st_nlink = statxbuf->stx_nlink;
  buf->st_uid = statxbuf->stx_uid;
  buf->st_gid = statxbuf->stx_gid;
  buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
  buf->st_ino = statxbuf->stx_ino;
  buf->st_size = statxbuf->stx_size;
  buf->st_blksize = statxbuf->stx_blksize;
  buf->st_blocks = statxbuf->stx_blocks;
  buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
  buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
  buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
  buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
  buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
  buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
  buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
  buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
  buf->st_flags = 0;
  buf->st_gen = 0;
}


static void uv__iou_fs_statx_post(uv_fs_t* req) {
  struct uv__statx* statxbuf;
  uv_stat_t* buf;

  buf = &req->statbuf;
  statxbuf = req->ptr;
  req->ptr = NULL;

  if (req->result == 0) {
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
    uv__statx_to_stat(statxbuf, buf);
    req->ptr = buf;
  }

  uv__free(statxbuf);
}


static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
  struct uv__io_uring_cqe* cqe;
  struct uv__io_uring_cqe* e;
  uv_fs_t* req;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t i;
  uint32_t flags;
  int nevents;
  int rc;

  head = *iou->cqhead;
  tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
                              memory_order_acquire);
  mask = iou->cqmask;
  cqe = iou->cqe;
  nevents = 0;

  for (i = head; i != tail; i++) {
    e = &cqe[i & mask];

    req = (uv_fs_t*) (uintptr_t) e->user_data;
    assert(req->type == UV_FS);

    uv__req_unregister(loop, req);
    iou->in_flight--;

    /* io_uring stores error codes as negative numbers, same as libuv. */
    req->result = e->res;

    switch (req->fs_type) {
      case UV_FS_FSTAT:
      case UV_FS_LSTAT:
      case UV_FS_STAT:
        uv__iou_fs_statx_post(req);
        break;
      default: /* Squelch -Wswitch warnings. */
        break;
    }

    uv__metrics_update_idle_time(loop);
    req->cb(req);
    nevents++;
  }

  atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
                        tail,
                        memory_order_release);

  /* Check whether CQEs overflowed; if so, enter the kernel to make them
   * available. Don't grab them immediately but in the next loop iteration to
   * avoid loop starvation. */
  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
    do
      rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
    while (rc == -1 && errno == EINTR);

    if (rc < 0)
      perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
  }

  uv__metrics_inc_events(loop, nevents);
  if (uv__get_internal_fields(loop)->current_timeout == 0)
    uv__metrics_inc_events_waiting(loop, nevents);
}


static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e) {
  struct uv__io_uring_sqe* sqe;
  struct epoll_event* pe;
  uint32_t mask;
  uint32_t slot;

  if (ctl->ringfd == -1) {
    if (!epoll_ctl(epollfd, op, fd, e))
      return;

    if (op == EPOLL_CTL_DEL)
      return; /* Ignore errors, may be racing with another thread. */

    if (op != EPOLL_CTL_ADD)
      abort();

    if (errno != EEXIST)
      abort();

    /* File descriptor that's been watched before, update event mask. */
    if (!epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, e))
      return;

    abort();
  } else {
    mask = ctl->sqmask;
    slot = (*ctl->sqtail)++ & mask;

    pe = &(*events)[slot];
    *pe = *e;

    sqe = ctl->sqe;
    sqe = &sqe[slot];

    memset(sqe, 0, sizeof(*sqe));
    sqe->addr = (uintptr_t) pe;
    sqe->fd = epollfd;
    sqe->len = op;
    sqe->off = fd;
    sqe->opcode = UV__IORING_OP_EPOLL_CTL;
    sqe->user_data = op | slot << 2 | (int64_t) fd << 32;

    if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
      uv__epoll_ctl_flush(epollfd, ctl, events);
  }
}


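/* Illustrative note (not part of the original source): the user_data word
 * built above packs three fields so uv__epoll_ctl_flush() can recover them
 * from the completion entry:
 *
 *   bits  0-1   epoll op      (EPOLL_CTL_ADD/DEL/MOD, all < 4 per the
 *                              STATIC_ASSERTs near the top of this file)
 *   bits  2-9   batch slot    (index into the 256-entry events array)
 *   bits 32-63  file descriptor
 *
 * For example op == EPOLL_CTL_ADD (1), slot == 5, fd == 42 encodes as
 * 1 | (5 << 2) | ((int64_t) 42 << 32) == 0x2A00000015.
 */
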
static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]) {
  struct epoll_event oldevents[256];
  struct uv__io_uring_cqe* cqe;
  uint32_t oldslot;
  uint32_t slot;
  uint32_t n;
  int fd;
  int op;
  int rc;

  STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
  assert(ctl->ringfd != -1);
  assert(*ctl->sqhead != *ctl->sqtail);

  n = *ctl->sqtail - *ctl->sqhead;
  do
    rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
  while (rc == -1 && errno == EINTR);

  if (rc < 0)
    perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */

  if (rc != (int) n)
    abort();

  assert(*ctl->sqhead == *ctl->sqtail);

  memcpy(oldevents, *events, sizeof(*events));

  /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
   * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
   * that we are already watching. Ignore the former and retry the latter
   * with EPOLL_CTL_MOD.
   */
  while (*ctl->cqhead != *ctl->cqtail) {
    slot = (*ctl->cqhead)++ & ctl->cqmask;

    cqe = ctl->cqe;
    cqe = &cqe[slot];

    if (cqe->res == 0)
      continue;

    fd = cqe->user_data >> 32;
    op = 3 & cqe->user_data;
    oldslot = 255 & (cqe->user_data >> 2);

    if (op == EPOLL_CTL_DEL)
      continue;

    if (op != EPOLL_CTL_ADD)
      abort();

    if (cqe->res != -EEXIST)
      abort();

    uv__epoll_ctl_prep(epollfd,
                       ctl,
                       events,
                       EPOLL_CTL_MOD,
                       fd,
                       &oldevents[oldslot]);
  }
}


void uv__io_poll(uv_loop_t* loop, int timeout) {
|
|
uv__loop_internal_fields_t* lfields;
|
|
struct epoll_event events[1024];
|
|
struct epoll_event prep[256];
|
|
struct uv__invalidate inv;
|
|
struct epoll_event* pe;
|
|
struct epoll_event e;
|
|
struct uv__iou* ctl;
|
|
struct uv__iou* iou;
|
|
int real_timeout;
|
|
struct uv__queue* q;
|
|
uv__io_t* w;
|
|
sigset_t* sigmask;
|
|
sigset_t sigset;
|
|
uint64_t base;
|
|
int have_iou_events;
|
|
int have_signals;
|
|
int nevents;
|
|
int epollfd;
|
|
int count;
|
|
int nfds;
|
|
int fd;
|
|
int op;
|
|
int i;
|
|
int user_timeout;
|
|
int reset_timeout;
|
|
|
|
lfields = uv__get_internal_fields(loop);
|
|
ctl = &lfields->ctl;
|
|
iou = &lfields->iou;
|
|
|
|
sigmask = NULL;
|
|
if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
|
|
sigemptyset(&sigset);
|
|
sigaddset(&sigset, SIGPROF);
|
|
sigmask = &sigset;
|
|
}
|
|
|
|
assert(timeout >= -1);
|
|
base = loop->time;
|
|
count = 48; /* Benchmarks suggest this gives the best throughput. */
|
|
real_timeout = timeout;
|
|
|
|
if (lfields->flags & UV_METRICS_IDLE_TIME) {
|
|
reset_timeout = 1;
|
|
user_timeout = timeout;
|
|
timeout = 0;
|
|
} else {
|
|
reset_timeout = 0;
|
|
user_timeout = 0;
|
|
}
|
|
|
|
epollfd = loop->backend_fd;
|
|
|
|
memset(&e, 0, sizeof(e));
|
|
|
|
while (!uv__queue_empty(&loop->watcher_queue)) {
|
|
q = uv__queue_head(&loop->watcher_queue);
|
|
w = uv__queue_data(q, uv__io_t, watcher_queue);
|
|
uv__queue_remove(q);
|
|
uv__queue_init(q);
|
|
|
|
op = EPOLL_CTL_MOD;
|
|
if (w->events == 0)
|
|
op = EPOLL_CTL_ADD;
|
|
|
|
w->events = w->pevents;
|
|
e.events = w->pevents;
|
|
e.data.fd = w->fd;
|
|
|
|
uv__epoll_ctl_prep(epollfd, ctl, &prep, op, w->fd, &e);
|
|
}
|
|
|
|
inv.events = events;
|
|
inv.prep = &prep;
|
|
inv.nfds = -1;
|
|
|
|
for (;;) {
|
|
if (loop->nfds == 0)
|
|
if (iou->in_flight == 0)
|
|
break;
|
|
|
|
/* All event mask mutations should be visible to the kernel before
|
|
* we enter epoll_pwait().
|
|
*/
|
|
if (ctl->ringfd != -1)
|
|
while (*ctl->sqhead != *ctl->sqtail)
|
|
uv__epoll_ctl_flush(epollfd, ctl, &prep);
|
|
|
|
/* Only need to set the provider_entry_time if timeout != 0. The function
|
|
* will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
|
|
*/
|
|
if (timeout != 0)
|
|
uv__metrics_set_provider_entry_time(loop);
|
|
|
|
/* Store the current timeout in a location that's globally accessible so
|
|
* other locations like uv__work_done() can determine whether the queue
|
|
* of events in the callback were waiting when poll was called.
|
|
*/
|
|
lfields->current_timeout = timeout;
|
|
|
|
nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
|
|
|
|
/* Update loop->time unconditionally. It's tempting to skip the update when
|
|
* timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
|
|
* operating system didn't reschedule our process while in the syscall.
|
|
*/
|
|
SAVE_ERRNO(uv__update_time(loop));
|
|
|
|
if (nfds == 0) {
|
|
assert(timeout != -1);
|
|
|
|
if (reset_timeout != 0) {
|
|
timeout = user_timeout;
|
|
reset_timeout = 0;
|
|
}
|
|
|
|
if (timeout == -1)
|
|
continue;
|
|
|
|
if (timeout == 0)
|
|
break;
|
|
|
|
/* We may have been inside the system call for longer than |timeout|
|
|
* milliseconds so we need to update the timestamp to avoid drift.
|
|
*/
|
|
goto update_timeout;
|
|
}
|
|
|
|
if (nfds == -1) {
|
|
if (errno != EINTR)
|
|
abort();
|
|
|
|
if (reset_timeout != 0) {
|
|
timeout = user_timeout;
|
|
reset_timeout = 0;
|
|
}
|
|
|
|
if (timeout == -1)
|
|
continue;
|
|
|
|
if (timeout == 0)
|
|
break;
|
|
|
|
/* Interrupted by a signal. Update timeout and poll again. */
|
|
goto update_timeout;
|
|
}
|
|
|
|
have_iou_events = 0;
|
|
have_signals = 0;
|
|
nevents = 0;
|
|
|
|
inv.nfds = nfds;
|
|
lfields->inv = &inv;
|
|
|
|
for (i = 0; i < nfds; i++) {
|
|
pe = events + i;
|
|
fd = pe->data.fd;
|
|
|
|
/* Skip invalidated events, see uv__platform_invalidate_fd */
|
|
if (fd == -1)
|
|
continue;
|
|
|
|
if (fd == iou->ringfd) {
|
|
uv__poll_io_uring(loop, iou);
|
|
have_iou_events = 1;
|
|
continue;
|
|
}
|
|
|
|
assert(fd >= 0);
|
|
assert((unsigned) fd < loop->nwatchers);
|
|
|
|
w = loop->watchers[fd];
|
|
|
|
if (w == NULL) {
|
|
/* File descriptor that we've stopped watching, disarm it.
|
|
*
|
|
* Ignore all errors because we may be racing with another thread
|
|
* when the file descriptor is closed.
|
|
*/
|
|
uv__epoll_ctl_prep(epollfd, ctl, &prep, EPOLL_CTL_DEL, fd, pe);
|
|
continue;
|
|
}
|
|
|
|
      /* Give users only events they're interested in. Prevents spurious
       * callbacks when previous callback invocation in this loop has stopped
       * the current watcher. Also filters out events that the user has not
       * asked us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;
|
|
|
|
/* Work around an epoll quirk where it sometimes reports just the
|
|
* EPOLLERR or EPOLLHUP event. In order to force the event loop to
|
|
* move forward, we merge in the read/write events that the watcher
|
|
* is interested in; uv__read() and uv__write() will then deal with
|
|
* the error or hangup in the usual fashion.
|
|
*
|
|
* Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
|
|
* reads the available data, calls uv_read_stop(), then sometime later
|
|
* calls uv_read_start() again. By then, libuv has forgotten about the
|
|
* hangup and the kernel won't report EPOLLIN again because there's
|
|
* nothing left to read. If anything, libuv is to blame here. The
|
|
* current hack is just a quick bandaid; to properly fix it, libuv
|
|
* needs to remember the error/hangup event. We should get that for
|
|
* free when we switch over to edge-triggered I/O.
|
|
*/
|
|
if (pe->events == POLLERR || pe->events == POLLHUP)
|
|
pe->events |=
|
|
w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);
|
|
|
|
if (pe->events != 0) {
|
|
/* Run signal watchers last. This also affects child process watchers
|
|
* because those are implemented in terms of signal watchers.
|
|
*/
|
|
if (w == &loop->signal_io_watcher) {
|
|
have_signals = 1;
|
|
} else {
|
|
uv__metrics_update_idle_time(loop);
|
|
w->cb(loop, w, pe->events);
|
|
}
|
|
|
|
nevents++;
|
|
}
|
|
}
|
|
|
|
uv__metrics_inc_events(loop, nevents);
|
|
if (reset_timeout != 0) {
|
|
timeout = user_timeout;
|
|
reset_timeout = 0;
|
|
uv__metrics_inc_events_waiting(loop, nevents);
|
|
}
|
|
|
|
if (have_signals != 0) {
|
|
uv__metrics_update_idle_time(loop);
|
|
loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
|
|
}
|
|
|
|
lfields->inv = NULL;
|
|
|
|
if (have_iou_events != 0)
|
|
break; /* Event loop should cycle now so don't poll again. */
|
|
|
|
if (have_signals != 0)
|
|
break; /* Event loop should cycle now so don't poll again. */
|
|
|
|
if (nevents != 0) {
|
|
if (nfds == ARRAY_SIZE(events) && --count != 0) {
|
|
/* Poll for more events but don't block this time. */
|
|
timeout = 0;
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
|
|
if (timeout == 0)
|
|
break;
|
|
|
|
if (timeout == -1)
|
|
continue;
|
|
|
|
update_timeout:
|
|
assert(timeout > 0);
|
|
|
|
real_timeout -= (loop->time - base);
|
|
if (real_timeout <= 0)
|
|
break;
|
|
|
|
timeout = real_timeout;
|
|
}
|
|
|
|
if (ctl->ringfd != -1)
|
|
while (*ctl->sqhead != *ctl->sqtail)
|
|
uv__epoll_ctl_flush(epollfd, ctl, &prep);
|
|
}
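/* Illustrative note (not part of the original source): the update_timeout
 * path above compensates for time spent inside epoll_pwait(). If the caller
 * asked for a 100 ms timeout and the wait was interrupted by a signal after
 * 30 ms, loop->time - base is roughly 30, so the loop retries with
 * real_timeout = 100 - 30 = 70 ms instead of restarting the full interval.
 */
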
|
|
|
|
uint64_t uv__hrtime(uv_clocktype_t type) {
|
|
static _Atomic clock_t fast_clock_id = -1;
|
|
struct timespec t;
|
|
clock_t clock_id;
|
|
|
|
/* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
|
|
* millisecond granularity or better. CLOCK_MONOTONIC_COARSE is
|
|
* serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
|
|
* decide to make a costly system call.
|
|
*/
|
|
/* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
|
|
* when it has microsecond granularity or better (unlikely).
|
|
*/
|
|
clock_id = CLOCK_MONOTONIC;
|
|
if (type != UV_CLOCK_FAST)
|
|
goto done;
|
|
|
|
clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
|
|
if (clock_id != -1)
|
|
goto done;
|
|
|
|
clock_id = CLOCK_MONOTONIC;
|
|
if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
|
|
if (t.tv_nsec <= 1 * 1000 * 1000)
|
|
clock_id = CLOCK_MONOTONIC_COARSE;
|
|
|
|
atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
|
|
|
|
done:
|
|
|
|
if (clock_gettime(clock_id, &t))
|
|
return 0; /* Not really possible. */
|
|
|
|
return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
|
|
}
|
|
|
|
|
|
int uv_resident_set_memory(size_t* rss) {
|
|
char buf[1024];
|
|
const char* s;
|
|
ssize_t n;
|
|
long val;
|
|
int fd;
|
|
int i;
|
|
|
|
do
|
|
fd = open("/proc/self/stat", O_RDONLY);
|
|
while (fd == -1 && errno == EINTR);
|
|
|
|
if (fd == -1)
|
|
return UV__ERR(errno);
|
|
|
|
do
|
|
n = read(fd, buf, sizeof(buf) - 1);
|
|
while (n == -1 && errno == EINTR);
|
|
|
|
uv__close(fd);
|
|
if (n == -1)
|
|
return UV__ERR(errno);
|
|
buf[n] = '\0';
|
|
|
|
s = strchr(buf, ' ');
|
|
if (s == NULL)
|
|
goto err;
|
|
|
|
s += 1;
|
|
if (*s != '(')
|
|
goto err;
|
|
|
|
s = strchr(s, ')');
|
|
if (s == NULL)
|
|
goto err;
|
|
|
|
for (i = 1; i <= 22; i++) {
|
|
s = strchr(s + 1, ' ');
|
|
if (s == NULL)
|
|
goto err;
|
|
}
|
|
|
|
errno = 0;
|
|
val = strtol(s, NULL, 10);
|
|
if (errno != 0)
|
|
goto err;
|
|
if (val < 0)
|
|
goto err;
|
|
|
|
*rss = val * getpagesize();
|
|
return 0;
|
|
|
|
err:
|
|
return UV_EINVAL;
|
|
}
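/* Illustrative note (not part of the original source): /proc/self/stat puts
 * the process name in parentheses, and that name may itself contain spaces,
 * so the parser above first seeks past the closing ')' and only then counts
 * space-separated fields. Skipping 22 separators from there lands on the rss
 * field (field 24 in proc(5)), which is reported in pages and converted to
 * bytes with getpagesize().
 */
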
|
|
|
|
int uv_uptime(double* uptime) {
|
|
struct timespec now;
|
|
char buf[128];
|
|
|
|
/* Consult /proc/uptime when present (common case), or fall back to
|
|
* clock_gettime. Why not always clock_gettime? It doesn't always return the
|
|
* right result under OpenVZ and possibly other containerized environments.
|
|
*/
|
|
if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
|
|
if (1 == sscanf(buf, "%lf", uptime))
|
|
return 0;
|
|
|
|
if (clock_gettime(CLOCK_BOOTTIME, &now))
|
|
return UV__ERR(errno);
|
|
|
|
*uptime = now.tv_sec;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
|
|
#if defined(__PPC__)
|
|
static const char model_marker[] = "cpu\t\t: ";
|
|
#elif defined(__arm__)
|
|
static const char model_marker[] = "Processor\t: ";
|
|
#elif defined(__aarch64__)
|
|
static const char model_marker[] = "CPU part\t: ";
|
|
#elif defined(__mips__)
|
|
static const char model_marker[] = "cpu model\t\t: ";
|
|
#elif defined(__loongarch__)
|
|
static const char model_marker[] = "cpu family\t\t: ";
|
|
#else
|
|
static const char model_marker[] = "model name\t: ";
|
|
#endif
|
|
static const char parts[] =
|
|
#ifdef __aarch64__
|
|
"0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n"
|
|
"0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n"
|
|
"0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n"
|
|
"0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
|
|
"0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n"
|
|
"0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n"
|
|
"0xc0d\nCortex-A17\n" /* Originally A12 */
|
|
"0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n"
|
|
"0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n"
|
|
"0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n"
|
|
"0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n"
|
|
"0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n"
|
|
"0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n"
|
|
"0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n"
|
|
"0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n"
|
|
"0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n"
|
|
"0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n"
|
|
"0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n"
|
|
#endif
|
|
"";
|
|
struct cpu {
|
|
unsigned long long freq, user, nice, sys, idle, irq;
|
|
unsigned model;
|
|
};
|
|
FILE* fp;
|
|
char* p;
|
|
int found;
|
|
int n;
|
|
unsigned i;
|
|
unsigned cpu;
|
|
unsigned maxcpu;
|
|
unsigned size;
|
|
unsigned long long skip;
|
|
struct cpu (*cpus)[8192]; /* Kernel maximum. */
|
|
struct cpu* c;
|
|
struct cpu t;
|
|
char (*model)[64];
|
|
unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
|
|
/* Assumption: even big.LITTLE systems will have only a handful
|
|
* of different CPU models. Most systems will just have one.
|
|
*/
|
|
char models[8][64];
|
|
char buf[1024];
|
|
|
|
memset(bitmap, 0, sizeof(bitmap));
|
|
memset(models, 0, sizeof(models));
|
|
snprintf(*models, sizeof(*models), "unknown");
|
|
maxcpu = 0;
|
|
|
|
cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
|
|
if (cpus == NULL)
|
|
return UV_ENOMEM;
|
|
|
|
fp = uv__open_file("/proc/stat");
|
|
if (fp == NULL) {
|
|
uv__free(cpus);
|
|
return UV__ERR(errno);
|
|
}
|
|
|
|
fgets(buf, sizeof(buf), fp); /* Skip first line. */
|
|
|
|
for (;;) {
|
|
memset(&t, 0, sizeof(t));
|
|
|
|
n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
|
|
&cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
|
|
|
|
if (n != 7)
|
|
break;
|
|
|
|
fgets(buf, sizeof(buf), fp); /* Skip rest of line. */
|
|
|
|
if (cpu >= ARRAY_SIZE(*cpus))
|
|
continue;
|
|
|
|
(*cpus)[cpu] = t;
|
|
|
|
bitmap[cpu >> 3] |= 1 << (cpu & 7);
|
|
|
|
if (cpu >= maxcpu)
|
|
maxcpu = cpu + 1;
|
|
}
|
|
|
|
fclose(fp);
|
|
|
|
fp = uv__open_file("/proc/cpuinfo");
|
|
if (fp == NULL)
|
|
goto nocpuinfo;
|
|
|
|
for (;;) {
|
|
if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
|
|
break; /* Parse error. */
|
|
|
|
found = 0;
|
|
while (!found && fgets(buf, sizeof(buf), fp))
|
|
found = !strncmp(buf, model_marker, sizeof(model_marker) - 1);
|
|
|
|
if (!found)
|
|
goto next;
|
|
|
|
p = buf + sizeof(model_marker) - 1;
|
|
n = (int) strcspn(p, "\n");
|
|
|
|
/* arm64: translate CPU part code to model name. */
|
|
if (*parts) {
|
|
p = memmem(parts, sizeof(parts) - 1, p, n + 1);
|
|
if (p == NULL)
|
|
p = "unknown";
|
|
else
|
|
p += n + 1;
|
|
n = (int) strcspn(p, "\n");
|
|
}
|
|
|
|
found = 0;
|
|
for (model = models; !found && model < ARRAY_END(models); model++)
|
|
found = !strncmp(p, *model, strlen(*model));
|
|
|
|
if (!found)
|
|
goto next;
|
|
|
|
if (**model == '\0')
|
|
snprintf(*model, sizeof(*model), "%.*s", n, p);
|
|
|
|
if (cpu < maxcpu)
|
|
(*cpus)[cpu].model = model - models;
|
|
|
|
next:
|
|
while (fgets(buf, sizeof(buf), fp))
|
|
if (*buf == '\n')
|
|
break;
|
|
}
|
|
|
|
fclose(fp);
|
|
fp = NULL;
|
|
|
|
nocpuinfo:
|
|
|
|
n = 0;
|
|
for (cpu = 0; cpu < maxcpu; cpu++) {
|
|
if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
|
|
continue;
|
|
|
|
n++;
|
|
snprintf(buf, sizeof(buf),
|
|
"/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
|
|
|
|
fp = uv__open_file(buf);
|
|
if (fp == NULL)
|
|
continue;
|
|
|
|
fscanf(fp, "%llu", &(*cpus)[cpu].freq);
|
|
fclose(fp);
|
|
fp = NULL;
|
|
}
|
|
|
|
size = n * sizeof(**ci) + sizeof(models);
|
|
*ci = uv__malloc(size);
|
|
*count = 0;
|
|
|
|
if (*ci == NULL) {
|
|
uv__free(cpus);
|
|
return UV_ENOMEM;
|
|
}
|
|
|
|
*count = n;
|
|
p = memcpy(*ci + n, models, sizeof(models));
|
|
|
|
i = 0;
|
|
for (cpu = 0; cpu < maxcpu; cpu++) {
|
|
if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
|
|
continue;
|
|
|
|
c = *cpus + cpu;
|
|
|
|
(*ci)[i++] = (uv_cpu_info_t) {
|
|
.model = p + c->model * sizeof(*model),
|
|
.speed = c->freq / 1000,
|
|
/* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
|
|
* therefore the multiplier is always 1000/100 = 10.
|
|
*/
|
|
.cpu_times = (struct uv_cpu_times_s) {
|
|
.user = 10 * c->user,
|
|
.nice = 10 * c->nice,
|
|
.sys = 10 * c->sys,
|
|
.idle = 10 * c->idle,
|
|
.irq = 10 * c->irq,
|
|
},
|
|
};
|
|
}
|
|
|
|
uv__free(cpus);
|
|
|
|
return 0;
|
|
}
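/* Illustrative note (not part of the original source): on arm64 the
 * /proc/cpuinfo "CPU part" field is a numeric code, and the parts[] string
 * above acts as a lookup table of "code\nname\n" pairs. Searching for
 * "0xd07\n" with memmem(), for instance, finds that pair; advancing past the
 * code and its newline leaves p pointing at "Cortex-A57", whose length is
 * then taken with strcspn().
 */
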
|
|
|
|
|
|
#ifdef HAVE_IFADDRS_H
|
|
static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
|
|
if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
|
|
return 1;
|
|
if (ent->ifa_addr == NULL)
|
|
return 1;
|
|
/*
|
|
* On Linux getifaddrs returns information related to the raw underlying
|
|
* devices. We're not interested in this information yet.
|
|
*/
|
|
if (ent->ifa_addr->sa_family == PF_PACKET)
|
|
return exclude_type;
|
|
return !exclude_type;
|
|
}
|
|
#endif
|
|
|
|
int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
|
|
#ifndef HAVE_IFADDRS_H
|
|
*count = 0;
|
|
*addresses = NULL;
|
|
return UV_ENOSYS;
|
|
#else
|
|
struct ifaddrs *addrs, *ent;
|
|
uv_interface_address_t* address;
|
|
int i;
|
|
struct sockaddr_ll *sll;
|
|
|
|
*count = 0;
|
|
*addresses = NULL;
|
|
|
|
if (getifaddrs(&addrs))
|
|
return UV__ERR(errno);
|
|
|
|
/* Count the number of interfaces */
|
|
for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
|
|
if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
|
|
continue;
|
|
|
|
(*count)++;
|
|
}
|
|
|
|
if (*count == 0) {
|
|
freeifaddrs(addrs);
|
|
return 0;
|
|
}
|
|
|
|
  /* Make sure the memory is initialized to zero using calloc() */
|
|
*addresses = uv__calloc(*count, sizeof(**addresses));
|
|
if (!(*addresses)) {
|
|
freeifaddrs(addrs);
|
|
return UV_ENOMEM;
|
|
}
|
|
|
|
address = *addresses;
|
|
|
|
for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
|
|
if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
|
|
continue;
|
|
|
|
address->name = uv__strdup(ent->ifa_name);
|
|
|
|
if (ent->ifa_addr->sa_family == AF_INET6) {
|
|
address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
|
|
} else {
|
|
address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
|
|
}
|
|
|
|
if (ent->ifa_netmask->sa_family == AF_INET6) {
|
|
address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
|
|
} else {
|
|
address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
|
|
}
|
|
|
|
address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
|
|
|
|
address++;
|
|
}
|
|
|
|
/* Fill in physical addresses for each interface */
|
|
for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
|
|
if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
|
|
continue;
|
|
|
|
address = *addresses;
|
|
|
|
for (i = 0; i < (*count); i++) {
|
|
size_t namelen = strlen(ent->ifa_name);
|
|
      /* Alias interfaces share the same physical address */
|
|
if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
|
|
(address->name[namelen] == 0 || address->name[namelen] == ':')) {
|
|
sll = (struct sockaddr_ll*)ent->ifa_addr;
|
|
memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
|
|
}
|
|
address++;
|
|
}
|
|
}
|
|
|
|
freeifaddrs(addrs);
|
|
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
|
|
void uv_free_interface_addresses(uv_interface_address_t* addresses,
|
|
int count) {
|
|
int i;
|
|
|
|
for (i = 0; i < count; i++) {
|
|
uv__free(addresses[i].name);
|
|
}
|
|
|
|
uv__free(addresses);
|
|
}
|
|
|
|
|
|
void uv__set_process_title(const char* title) {
|
|
#if defined(PR_SET_NAME)
|
|
prctl(PR_SET_NAME, title); /* Only copies first 16 characters. */
|
|
#endif
|
|
}
|
|
|
|
|
|
static uint64_t uv__read_proc_meminfo(const char* what) {
|
|
uint64_t rc;
|
|
char* p;
|
|
char buf[4096]; /* Large enough to hold all of /proc/meminfo. */
|
|
|
|
if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
|
|
return 0;
|
|
|
|
p = strstr(buf, what);
|
|
|
|
if (p == NULL)
|
|
return 0;
|
|
|
|
p += strlen(what);
|
|
|
|
rc = 0;
|
|
sscanf(p, "%" PRIu64 " kB", &rc);
|
|
|
|
return rc * 1024;
|
|
}
|
|
|
|
|
|
uint64_t uv_get_free_memory(void) {
|
|
struct sysinfo info;
|
|
uint64_t rc;
|
|
|
|
rc = uv__read_proc_meminfo("MemAvailable:");
|
|
|
|
if (rc != 0)
|
|
return rc;
|
|
|
|
if (0 == sysinfo(&info))
|
|
return (uint64_t) info.freeram * info.mem_unit;
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
uint64_t uv_get_total_memory(void) {
|
|
struct sysinfo info;
|
|
uint64_t rc;
|
|
|
|
rc = uv__read_proc_meminfo("MemTotal:");
|
|
|
|
if (rc != 0)
|
|
return rc;
|
|
|
|
if (0 == sysinfo(&info))
|
|
return (uint64_t) info.totalram * info.mem_unit;
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
static uint64_t uv__read_uint64(const char* filename) {
|
|
char buf[32]; /* Large enough to hold an encoded uint64_t. */
|
|
uint64_t rc;
|
|
|
|
rc = 0;
|
|
if (0 == uv__slurp(filename, buf, sizeof(buf)))
|
|
if (1 != sscanf(buf, "%" PRIu64, &rc))
|
|
if (0 == strcmp(buf, "max\n"))
|
|
rc = UINT64_MAX;
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
/* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
|
|
* finds the location and length of the memory controller mount path.
|
|
* This disregards the leading / for easy concatenation of paths.
|
|
* Returns NULL if the memory controller wasn't found. */
|
|
static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
|
|
int* n) {
|
|
char* p;
|
|
|
|
/* Seek to the memory controller line. */
|
|
p = strchr(buf, ':');
|
|
while (p != NULL && strncmp(p, ":memory:", 8)) {
|
|
p = strchr(p, '\n');
|
|
if (p != NULL)
|
|
p = strchr(p, ':');
|
|
}
|
|
|
|
if (p != NULL) {
|
|
/* Determine the length of the mount path. */
|
|
p = p + strlen(":memory:/");
|
|
*n = (int) strcspn(p, "\n");
|
|
}
|
|
|
|
return p;
|
|
}
|
|
|
|
static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
|
|
uint64_t* max) {
|
|
char filename[4097];
|
|
char* p;
|
|
int n;
|
|
uint64_t cgroup1_max;
|
|
|
|
/* Find out where the controller is mounted. */
|
|
p = uv__cgroup1_find_memory_controller(buf, &n);
|
|
if (p != NULL) {
|
|
snprintf(filename, sizeof(filename),
|
|
"/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
|
|
*high = uv__read_uint64(filename);
|
|
|
|
snprintf(filename, sizeof(filename),
|
|
"/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
|
|
*max = uv__read_uint64(filename);
|
|
|
|
/* If the controller wasn't mounted, the reads above will have failed,
|
|
* as indicated by uv__read_uint64 returning 0.
|
|
*/
|
|
if (*high != 0 && *max != 0)
|
|
goto update_limits;
|
|
}
|
|
|
|
/* Fall back to the limits of the global memory controller. */
|
|
*high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
|
|
*max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
|
|
|
|
/* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
|
|
* cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
|
|
*/
|
|
update_limits:
|
|
cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
|
|
if (*high == cgroup1_max)
|
|
*high = UINT64_MAX;
|
|
if (*max == cgroup1_max)
|
|
*max = UINT64_MAX;
|
|
}
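/* Illustrative note (not part of the original source): an "unlimited" cgroup
 * v1 memory limit is reported as LONG_MAX rounded down to a page boundary.
 * On x86_64 with 4 KiB pages that is 0x7FFFFFFFFFFFF000
 * (9223372036854771712), which the comparison above translates to UINT64_MAX
 * so that it matches how cgroup v2's "max" is handled by uv__read_uint64().
 */
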
|
|
|
|
static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
|
|
uint64_t* max) {
|
|
char filename[4097];
|
|
char* p;
|
|
int n;
|
|
|
|
/* Find out where the controller is mounted. */
|
|
p = buf + strlen("0::/");
|
|
n = (int) strcspn(p, "\n");
|
|
|
|
/* Read the memory limits of the controller. */
|
|
snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
|
|
*max = uv__read_uint64(filename);
|
|
snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
|
|
*high = uv__read_uint64(filename);
|
|
}
|
|
|
|
static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
|
|
uint64_t high;
|
|
uint64_t max;
|
|
|
|
/* In the case of cgroupv2, we'll only have a single entry. */
|
|
if (strncmp(buf, "0::/", 4))
|
|
uv__get_cgroup1_memory_limits(buf, &high, &max);
|
|
else
|
|
uv__get_cgroup2_memory_limits(buf, &high, &max);
|
|
|
|
if (high == 0 || max == 0)
|
|
return 0;
|
|
|
|
return high < max ? high : max;
|
|
}
|
|
|
|
uint64_t uv_get_constrained_memory(void) {
|
|
char buf[1024];
|
|
|
|
if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
|
|
return 0;
|
|
|
|
return uv__get_cgroup_constrained_memory(buf);
|
|
}
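/* Illustrative note (not part of the original source): the "0::/" prefix
 * test distinguishes the two cgroup layouts in /proc/self/cgroup. On a pure
 * cgroup v2 host the file holds a single line such as
 *
 *   0::/user.slice/user-1000.slice/session-1.scope
 *
 * whereas cgroup v1 lists one "<id>:<controller>:<path>" line per
 * controller, e.g. "4:memory:/docker/abc123", which is what
 * uv__cgroup1_find_memory_controller() scans for.
 */
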
|
|
|
|
|
|
static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
|
|
char filename[4097];
|
|
uint64_t current;
|
|
char* p;
|
|
int n;
|
|
|
|
/* Find out where the controller is mounted. */
|
|
p = uv__cgroup1_find_memory_controller(buf, &n);
|
|
if (p != NULL) {
|
|
snprintf(filename, sizeof(filename),
|
|
"/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
|
|
current = uv__read_uint64(filename);
|
|
|
|
/* If the controller wasn't mounted, the reads above will have failed,
|
|
* as indicated by uv__read_uint64 returning 0.
|
|
*/
|
|
if (current != 0)
|
|
return current;
|
|
}
|
|
|
|
/* Fall back to the usage of the global memory controller. */
|
|
return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
|
|
}
|
|
|
|
static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
|
|
char filename[4097];
|
|
char* p;
|
|
int n;
|
|
|
|
/* Find out where the controller is mounted. */
|
|
p = buf + strlen("0::/");
|
|
n = (int) strcspn(p, "\n");
|
|
|
|
snprintf(filename, sizeof(filename),
|
|
"/sys/fs/cgroup/%.*s/memory.current", n, p);
|
|
return uv__read_uint64(filename);
|
|
}
|
|
|
|
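/* Available memory is the cgroup limit minus the cgroup's current usage.
 * When there is no (plausible) cgroup limit, fall back to the system-wide
 * free memory. */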
uint64_t uv_get_available_memory(void) {
  char buf[1024];
  uint64_t constrained;
  uint64_t current;
  uint64_t total;

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  constrained = uv__get_cgroup_constrained_memory(buf);
  if (constrained == 0)
    return uv_get_free_memory();

  total = uv_get_total_memory();
  if (constrained > total)
    return uv_get_free_memory();

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    current = uv__get_cgroup1_current_memory(buf);
  else
    current = uv__get_cgroup2_current_memory(buf);

  /* memory usage can be higher than the limit (for short bursts of time) */
  if (constrained < current)
    return 0;

  return constrained - current;
}


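/* Prefer /proc/loadavg; if it cannot be read, fall back to sysinfo(), whose
 * load averages are fixed-point values scaled by 1 << SI_LOAD_SHIFT (65536). */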
void uv_loadavg(double avg[3]) {
  struct sysinfo info;
  char buf[128];  /* Large enough to hold all of /proc/loadavg. */

  if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
    if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
      return;

  if (sysinfo(&info) < 0)
    return;

  avg[0] = (double) info.loads[0] / 65536.0;
  avg[1] = (double) info.loads[1] / 65536.0;
  avg[2] = (double) info.loads[2] / 65536.0;
}


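/* Comparator for the red-black tree of watcher lists: entries are ordered by
 * their inotify watch descriptor so incoming events can be matched to the
 * handles watching that path. */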
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b) {
  if (a->wd < b->wd) return -1;
  if (a->wd > b->wd) return 1;
  return 0;
}


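/* Lazily create the loop's inotify file descriptor (non-blocking and
 * close-on-exec) and start watching it for readability.  Calling this again
 * once the descriptor exists is a no-op. */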
static int init_inotify(uv_loop_t* loop) {
  int fd;

  if (loop->inotify_fd != -1)
    return 0;

  fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
  if (fd < 0)
    return UV__ERR(errno);

  loop->inotify_fd = fd;
  uv__io_init(&loop->inotify_read_watcher, uv__inotify_read, loop->inotify_fd);
  uv__io_start(loop, &loop->inotify_read_watcher, POLLIN);

  return 0;
}


static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
  /* Open the inotify_fd, and re-arm all the inotify watchers. */
  int err;
  struct watcher_list* tmp_watcher_list_iter;
  struct watcher_list* watcher_list;
  struct watcher_list tmp_watcher_list;
  struct uv__queue queue;
  struct uv__queue* q;
  uv_fs_event_t* handle;
  char* tmp_path;

  if (root == NULL)
    return 0;

  /* We must restore the old watcher list to be able to close items
   * out of it.
   */
  loop->inotify_watchers = root;

  uv__queue_init(&tmp_watcher_list.watchers);
  /* Note that the queue we use is shared with the start and stop()
   * functions, making uv__queue_foreach unsafe to use. So we use the
   * uv__queue_move trick to safely iterate. Also don't free the watcher
   * list until we're done iterating. c.f. uv__inotify_read.
   */
  RB_FOREACH_SAFE(watcher_list, watcher_root,
                  uv__inotify_watchers(loop), tmp_watcher_list_iter) {
    watcher_list->iterating = 1;
    uv__queue_move(&watcher_list->watchers, &queue);
    while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      /* It's critical to keep a copy of path here, because it
       * will be set to NULL by stop() and then deallocated by
       * maybe_free_watcher_list
       */
      tmp_path = uv__strdup(handle->path);
      assert(tmp_path != NULL);
      uv__queue_remove(q);
      uv__queue_insert_tail(&watcher_list->watchers, q);
      uv_fs_event_stop(handle);

      uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
      handle->path = tmp_path;
    }
    watcher_list->iterating = 0;
    maybe_free_watcher_list(watcher_list, loop);
  }

  uv__queue_move(&tmp_watcher_list.watchers, &queue);
  while (!uv__queue_empty(&queue)) {
    q = uv__queue_head(&queue);
    uv__queue_remove(q);
    handle = uv__queue_data(q, uv_fs_event_t, watchers);
    tmp_path = handle->path;
    handle->path = NULL;
    err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
    uv__free(tmp_path);
    if (err)
      return err;
  }

  return 0;
}


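/* Find the watcher list registered for an inotify watch descriptor; returns
 * NULL when no handle is interested in that descriptor (e.g. a stale event). */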
static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
  struct watcher_list w;
  w.wd = wd;
  return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
}


static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
  /* if the watcher_list->watchers is being iterated over, we can't free it. */
  if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
    /* No watchers left for this path. Clean up. */
    RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
    inotify_rm_watch(loop->inotify_fd, w->wd);
    uv__free(w);
  }
}


static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* dummy,
                             unsigned int events) {
  const struct inotify_event* e;
  struct watcher_list* w;
  uv_fs_event_t* h;
  struct uv__queue queue;
  struct uv__queue* q;
  const char* path;
  ssize_t size;
  const char *p;
  /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
  char buf[4096];

  for (;;) {
    do
      size = read(loop->inotify_fd, buf, sizeof(buf));
    while (size == -1 && errno == EINTR);

    if (size == -1) {
      assert(errno == EAGAIN || errno == EWOULDBLOCK);
      break;
    }

    assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */

    /* Now we have one or more inotify_event structs. */
    for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
      e = (const struct inotify_event*) p;

      events = 0;
      if (e->mask & (IN_ATTRIB|IN_MODIFY))
        events |= UV_CHANGE;
      if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
        events |= UV_RENAME;

      w = find_watcher(loop, e->wd);
      if (w == NULL)
        continue; /* Stale event, no watchers left. */

      /* inotify does not return the filename when monitoring a single file
       * for modifications. Repurpose the filename for API compatibility.
       * I'm not convinced this is a good thing, maybe it should go.
       */
      path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);

      /* We're about to iterate over the queue and call user's callbacks.
       * What can go wrong?
       * A callback could call uv_fs_event_stop()
       * and the queue can change under our feet.
       * So, we use uv__queue_move() trick to safely iterate over the queue.
       * And we don't free the watcher_list until we're done iterating.
       *
       * First,
       * tell uv_fs_event_stop() (that could be called from a user's callback)
       * not to free watcher_list.
       */
      w->iterating = 1;
      uv__queue_move(&w->watchers, &queue);
      while (!uv__queue_empty(&queue)) {
        q = uv__queue_head(&queue);
        h = uv__queue_data(q, uv_fs_event_t, watchers);

        uv__queue_remove(q);
        uv__queue_insert_tail(&w->watchers, q);

        h->cb(h, path, events, 0);
      }
      /* done iterating, time to (maybe) free empty watcher_list */
      w->iterating = 0;
      maybe_free_watcher_list(w, loop);
    }
  }
}


int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
  return 0;
}


int uv_fs_event_start(uv_fs_event_t* handle,
                      uv_fs_event_cb cb,
                      const char* path,
                      unsigned int flags) {
  struct watcher_list* w;
  uv_loop_t* loop;
  size_t len;
  int events;
  int err;
  int wd;

  if (uv__is_active(handle))
    return UV_EINVAL;

  loop = handle->loop;

  err = init_inotify(loop);
  if (err)
    return err;

  events = IN_ATTRIB
         | IN_CREATE
         | IN_MODIFY
         | IN_DELETE
         | IN_DELETE_SELF
         | IN_MOVE_SELF
         | IN_MOVED_FROM
         | IN_MOVED_TO;

  wd = inotify_add_watch(loop->inotify_fd, path, events);
  if (wd == -1)
    return UV__ERR(errno);

  w = find_watcher(loop, wd);
  if (w)
    goto no_insert;

  len = strlen(path) + 1;
  w = uv__malloc(sizeof(*w) + len);
  if (w == NULL)
    return UV_ENOMEM;

  w->wd = wd;
  w->path = memcpy(w + 1, path, len);
  uv__queue_init(&w->watchers);
  w->iterating = 0;
  RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);

no_insert:
  uv__handle_start(handle);
  uv__queue_insert_tail(&w->watchers, &handle->watchers);
  handle->path = w->path;
  handle->cb = cb;
  handle->wd = wd;

  return 0;
}


int uv_fs_event_stop(uv_fs_event_t* handle) {
  struct watcher_list* w;

  if (!uv__is_active(handle))
    return 0;

  w = find_watcher(handle->loop, handle->wd);
  assert(w != NULL);

  handle->wd = -1;
  handle->path = NULL;
  uv__handle_stop(handle);
  uv__queue_remove(&handle->watchers);

  maybe_free_watcher_list(w, handle->loop);

  return 0;
}


void uv__fs_event_close(uv_fs_event_t* handle) {
  uv_fs_event_stop(handle);
}