1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2 * Permission is hereby granted, free of charge, to any person obtaining a copy
3 * of this software and associated documentation files (the "Software"), to
4 * deal in the Software without restriction, including without limitation the
5 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6 * sell copies of the Software, and to permit persons to whom the Software is
7 * furnished to do so, subject to the following conditions:
8 *
9 * The above copyright notice and this permission notice shall be included in
10 * all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18 * IN THE SOFTWARE.
19 */
20
21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22 * EPOLL* counterparts. We use the POLL* variants in this file because that
23 * is what libuv uses elsewhere.
24 */
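/* For example, on Linux POLLIN == EPOLLIN (0x001), POLLOUT == EPOLLOUT (0x004),
 * POLLERR == EPOLLERR (0x008) and POLLHUP == EPOLLHUP (0x010), so the POLL*
 * constants can be stored directly in epoll_event.events and compared against
 * what epoll_pwait() reports.
 */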
25
26 #include "uv.h"
27 #include "internal.h"
28
29 #include <inttypes.h>
30 #include <stdatomic.h>
31 #include <stddef.h> /* offsetof */
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <errno.h>
38
39 #include <fcntl.h>
40 #include <ifaddrs.h>
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <netpacket/packet.h>
44 #include <sys/epoll.h>
45 #include <sys/inotify.h>
46 #include <sys/mman.h>
47 #include <sys/param.h>
48 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/stat.h>
51 #include <sys/syscall.h>
52 #include <sys/sysinfo.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/utsname.h>
56 #include <time.h>
57 #include <unistd.h>
58
59 #ifndef __NR_io_uring_setup
60 # define __NR_io_uring_setup 425
61 #endif
62
63 #ifndef __NR_io_uring_enter
64 # define __NR_io_uring_enter 426
65 #endif
66
67 #ifndef __NR_io_uring_register
68 # define __NR_io_uring_register 427
69 #endif
70
71 #ifndef __NR_copy_file_range
72 # if defined(__x86_64__)
73 # define __NR_copy_file_range 326
74 # elif defined(__i386__)
75 # define __NR_copy_file_range 377
76 # elif defined(__s390__)
77 # define __NR_copy_file_range 375
78 # elif defined(__arm__)
79 # define __NR_copy_file_range 391
80 # elif defined(__aarch64__)
81 # define __NR_copy_file_range 285
82 # elif defined(__powerpc__)
83 # define __NR_copy_file_range 379
84 # elif defined(__arc__)
85 # define __NR_copy_file_range 285
86 # elif defined(__riscv)
87 # define __NR_copy_file_range 285
88 # endif
89 #endif /* __NR_copy_file_range */
90
91 #ifndef __NR_statx
92 # if defined(__x86_64__)
93 # define __NR_statx 332
94 # elif defined(__i386__)
95 # define __NR_statx 383
96 # elif defined(__aarch64__)
97 # define __NR_statx 397
98 # elif defined(__arm__)
99 # define __NR_statx 397
100 # elif defined(__ppc__)
101 # define __NR_statx 383
102 # elif defined(__s390__)
103 # define __NR_statx 379
104 # elif defined(__riscv)
105 # define __NR_statx 291
106 # endif
107 #endif /* __NR_statx */
108
109 #ifndef __NR_getrandom
110 # if defined(__x86_64__)
111 # define __NR_getrandom 318
112 # elif defined(__i386__)
113 # define __NR_getrandom 355
114 # elif defined(__aarch64__)
115 # define __NR_getrandom 384
116 # elif defined(__arm__)
117 # define __NR_getrandom 384
118 # elif defined(__ppc__)
119 # define __NR_getrandom 359
120 # elif defined(__s390__)
121 # define __NR_getrandom 349
122 # elif defined(__riscv)
123 # define __NR_getrandom 278
124 # endif
125 #endif /* __NR_getrandom */
126
127 enum {
128 UV__IORING_SETUP_SQPOLL = 2u,
129 };
130
131 enum {
132 UV__IORING_FEAT_SINGLE_MMAP = 1u,
133 UV__IORING_FEAT_NODROP = 2u,
134 UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */
135 };
136
137 enum {
138 UV__IORING_OP_READV = 1,
139 UV__IORING_OP_WRITEV = 2,
140 UV__IORING_OP_FSYNC = 3,
141 UV__IORING_OP_OPENAT = 18,
142 UV__IORING_OP_CLOSE = 19,
143 UV__IORING_OP_STATX = 21,
144 UV__IORING_OP_EPOLL_CTL = 29,
145 UV__IORING_OP_RENAMEAT = 35,
146 UV__IORING_OP_UNLINKAT = 36,
147 UV__IORING_OP_MKDIRAT = 37,
148 UV__IORING_OP_SYMLINKAT = 38,
149 UV__IORING_OP_LINKAT = 39,
150 };
151
152 enum {
153 UV__IORING_ENTER_GETEVENTS = 1u,
154 UV__IORING_ENTER_SQ_WAKEUP = 2u,
155 };
156
157 enum {
158 UV__IORING_SQ_NEED_WAKEUP = 1u,
159 UV__IORING_SQ_CQ_OVERFLOW = 2u,
160 };
161
162 enum {
163 UV__MKDIRAT_SYMLINKAT_LINKAT = 1u,
164 };
165
166 struct uv__io_cqring_offsets {
167 uint32_t head;
168 uint32_t tail;
169 uint32_t ring_mask;
170 uint32_t ring_entries;
171 uint32_t overflow;
172 uint32_t cqes;
173 uint64_t reserved0;
174 uint64_t reserved1;
175 };
176
177 STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
178
179 struct uv__io_sqring_offsets {
180 uint32_t head;
181 uint32_t tail;
182 uint32_t ring_mask;
183 uint32_t ring_entries;
184 uint32_t flags;
185 uint32_t dropped;
186 uint32_t array;
187 uint32_t reserved0;
188 uint64_t reserved1;
189 };
190
191 STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
192
193 struct uv__io_uring_cqe {
194 uint64_t user_data;
195 int32_t res;
196 uint32_t flags;
197 };
198
199 STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
200
201 struct uv__io_uring_sqe {
202 uint8_t opcode;
203 uint8_t flags;
204 uint16_t ioprio;
205 int32_t fd;
206 union {
207 uint64_t off;
208 uint64_t addr2;
209 };
210 union {
211 uint64_t addr;
212 };
213 uint32_t len;
214 union {
215 uint32_t rw_flags;
216 uint32_t fsync_flags;
217 uint32_t open_flags;
218 uint32_t statx_flags;
219 };
220 uint64_t user_data;
221 union {
222 uint16_t buf_index;
223 uint64_t pad[3];
224 };
225 };
226
227 STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
228 STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
229 STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
230 STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
231 STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
232 STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
233 STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
234 STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
235 STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
236 STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
237 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
238
239 struct uv__io_uring_params {
240 uint32_t sq_entries;
241 uint32_t cq_entries;
242 uint32_t flags;
243 uint32_t sq_thread_cpu;
244 uint32_t sq_thread_idle;
245 uint32_t features;
246 uint32_t reserved[4];
247 struct uv__io_sqring_offsets sq_off; /* 40 bytes */
248 struct uv__io_cqring_offsets cq_off; /* 40 bytes */
249 };
250
251 STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
252 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
253 STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
254
255 STATIC_ASSERT(EPOLL_CTL_ADD < 4);
256 STATIC_ASSERT(EPOLL_CTL_DEL < 4);
257 STATIC_ASSERT(EPOLL_CTL_MOD < 4);
258
259 struct watcher_list {
260 RB_ENTRY(watcher_list) entry;
261 struct uv__queue watchers;
262 int iterating;
263 char* path;
264 int wd;
265 };
266
267 struct watcher_root {
268 struct watcher_list* rbh_root;
269 };
270
271 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
272 static void uv__inotify_read(uv_loop_t* loop,
273 uv__io_t* w,
274 unsigned int revents);
275 static int compare_watchers(const struct watcher_list* a,
276 const struct watcher_list* b);
277 static void maybe_free_watcher_list(struct watcher_list* w,
278 uv_loop_t* loop);
279
280 static void uv__epoll_ctl_flush(int epollfd,
281 struct uv__iou* ctl,
282 struct epoll_event (*events)[256]);
283
284 static void uv__epoll_ctl_prep(int epollfd,
285 struct uv__iou* ctl,
286 struct epoll_event (*events)[256],
287 int op,
288 int fd,
289 struct epoll_event* e);
290
291 RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
292
293
294 static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
295 /* This cast works because watcher_root is a struct with a pointer as its
296 * sole member. Such type punning is unsafe in the presence of strict
297 * pointer aliasing (and is just plain nasty) but that is why libuv
298 * is compiled with -fno-strict-aliasing.
299 */
300 return (struct watcher_root*) &loop->inotify_watchers;
301 }
302
303
304 unsigned uv__kernel_version(void) {
305 static _Atomic unsigned cached_version;
306 struct utsname u;
307 unsigned version;
308 unsigned major;
309 unsigned minor;
310 unsigned patch;
311 char v_sig[256];
312 char* needle;
313
314 version = atomic_load_explicit(&cached_version, memory_order_relaxed);
315 if (version != 0)
316 return version;
317
318 /* Check /proc/version_signature first as it's the way to get the mainline
319 * kernel version in Ubuntu. The format is:
320 * Ubuntu ubuntu_kernel_version mainline_kernel_version
321 * For example:
322 * Ubuntu 5.15.0-79.86-generic 5.15.111
323 */
324 if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
325 if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
326 goto calculate_version;
327
328 if (-1 == uname(&u))
329 return 0;
330
331 /* In Debian we need to check `version` instead of `release` to extract the
332 * mainline kernel version. For example, the version string looks like this:
333 * #1 SMP Debian 5.10.46-4 (2021-08-03)
334 */
335 needle = strstr(u.version, "Debian ");
336 if (needle != NULL)
337 if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
338 goto calculate_version;
339
340 if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
341 return 0;
342
343 /* Handle it when the process runs under the UNAME26 personality:
344 *
345 * - kernels >= 3.x identify as 2.6.40+x
346 * - kernels >= 4.x identify as 2.6.60+x
347 *
348 * UNAME26 is a poorly conceived hack that doesn't let us distinguish
349 * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
350 * that 2.6.60+x means 4.x.
351 *
352 * Fun fact of the day: it's technically possible to observe the actual
353 * kernel version for a brief moment because uname() first copies out the
354 * real release string before overwriting it with the backcompat string.
355 */
356 if (major == 2 && minor == 6) {
357 if (patch >= 60) {
358 major = 4;
359 minor = patch - 60;
360 patch = 0;
361 } else if (patch >= 40) {
362 major = 3;
363 minor = patch - 40;
364 patch = 0;
365 }
366 }
367
368 calculate_version:
369 version = major * 65536 + minor * 256 + patch;
370 atomic_store_explicit(&cached_version, version, memory_order_relaxed);
371
372 return version;
373 }
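/* The returned value packs the version as major * 65536 + minor * 256 + patch,
 * which is why the kernel-version checks below compare against hex literals,
 * e.g. 5.10.186 encodes as 0x050ABA and 5.15.90 as 0x050F5A.
 */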
374
375
376 ssize_t
377 uv__fs_copy_file_range(int fd_in,
378 off_t* off_in,
379 int fd_out,
380 off_t* off_out,
381 size_t len,
382 unsigned int flags)
383 {
384 #ifdef __NR_copy_file_range
385 return syscall(__NR_copy_file_range,
386 fd_in,
387 off_in,
388 fd_out,
389 off_out,
390 len,
391 flags);
392 #else
393 return errno = ENOSYS, -1;
394 #endif
395 }
396
397
398 int uv__statx(int dirfd,
399 const char* path,
400 int flags,
401 unsigned int mask,
402 struct uv__statx* statxbuf) {
403 #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
404 return errno = ENOSYS, -1;
405 #else
406 int rc;
407
408 rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
409 if (rc >= 0)
410 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
411
412 return rc;
413 #endif
414 }
415
416
417 ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
418 #if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
419 return errno = ENOSYS, -1;
420 #else
421 ssize_t rc;
422
423 rc = syscall(__NR_getrandom, buf, buflen, flags);
424 if (rc >= 0)
425 uv__msan_unpoison(buf, buflen);
426
427 return rc;
428 #endif
429 }
430
431
432 int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
433 return syscall(__NR_io_uring_setup, entries, params);
434 }
435
436
437 int uv__io_uring_enter(int fd,
438 unsigned to_submit,
439 unsigned min_complete,
440 unsigned flags) {
441 /* io_uring_enter used to take a sigset_t but it's unused
442 * in newer kernels unless IORING_ENTER_EXT_ARG is set,
443 * in which case it takes a struct io_uring_getevents_arg.
444 */
445 return syscall(__NR_io_uring_enter,
446 fd,
447 to_submit,
448 min_complete,
449 flags,
450 NULL,
451 0L);
452 }
453
454
455 int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
456 return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
457 }
458
459
460 static int uv__use_io_uring(void) {
461 #if defined(__ANDROID_API__)
462 return 0; /* Possibly available but blocked by seccomp. */
463 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
464 /* See https://github.com/libuv/libuv/issues/4158. */
465 return 0; /* All 32-bit kernels appear buggy. */
466 #elif defined(__powerpc64__) || defined(__ppc64__)
467 /* See https://github.com/libuv/libuv/issues/4283. */
468 return 0; /* Random SIGSEGV in signal handler. */
469 #else
470 /* Ternary: unknown=0, yes=1, no=-1 */
471 static _Atomic int use_io_uring;
472 char* val;
473 int use;
474
475 use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
476
477 if (use == 0) {
478 use = uv__kernel_version() >=
479 #if defined(__hppa__)
480 /* io_uring first supported on parisc in 6.1, functional in .51 */
481 /* https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/ */
482 /* 6.1.51 */ 0x060133
483 #else
484 /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
485 /* 5.10.186 */ 0x050ABA
486 #endif
487 ? 1 : -1;
488
489 /* But users can still enable it if they so desire. */
490 val = getenv("UV_USE_IO_URING");
491 if (val != NULL)
492 use = atoi(val) ? 1 : -1;
493
494 atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
495 }
496
497 return use > 0;
498 #endif
499 }
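/* Example: the kernel-version heuristic above can be overridden at run time,
 * e.g. `UV_USE_IO_URING=0 ./app` disables io_uring outright while
 * `UV_USE_IO_URING=1 ./app` forces it on (except on the platforms excluded by
 * the #if branches above, where the environment variable is never consulted).
 */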
500
501
502 static void uv__iou_init(int epollfd,
503 struct uv__iou* iou,
504 uint32_t entries,
505 uint32_t flags) {
506 struct uv__io_uring_params params;
507 struct epoll_event e;
508 size_t cqlen;
509 size_t sqlen;
510 size_t maxlen;
511 size_t sqelen;
512 uint32_t i;
513 char* sq;
514 char* sqe;
515 int ringfd;
516
517 sq = MAP_FAILED;
518 sqe = MAP_FAILED;
519
520 if (!uv__use_io_uring())
521 return;
522
523 /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
524 * Mostly academic because we check for a v5.13 kernel afterwards anyway.
525 */
526 memset(&params, 0, sizeof(params));
527 params.flags = flags;
528
529 if (flags & UV__IORING_SETUP_SQPOLL)
530 params.sq_thread_idle = 10; /* milliseconds */
531
532 /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
533 ringfd = uv__io_uring_setup(entries, &params);
534 if (ringfd == -1)
535 return;
536
537 /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
538 * actually detecting is whether IORING_OP_STATX works with SQPOLL.
539 */
540 if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
541 goto fail;
542
543 /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
544 if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
545 goto fail;
546
547 /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
548 if (!(params.features & UV__IORING_FEAT_NODROP))
549 goto fail;
550
551 sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
552 cqlen =
553 params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
554 maxlen = sqlen < cqlen ? cqlen : sqlen;
555 sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
556
557 sq = mmap(0,
558 maxlen,
559 PROT_READ | PROT_WRITE,
560 MAP_SHARED | MAP_POPULATE,
561 ringfd,
562 0); /* IORING_OFF_SQ_RING */
563
564 sqe = mmap(0,
565 sqelen,
566 PROT_READ | PROT_WRITE,
567 MAP_SHARED | MAP_POPULATE,
568 ringfd,
569 0x10000000ull); /* IORING_OFF_SQES */
570
571 if (sq == MAP_FAILED || sqe == MAP_FAILED)
572 goto fail;
573
574 if (flags & UV__IORING_SETUP_SQPOLL) {
575 /* Only interested in completion events. To get notified when
576 * the kernel pulls items from the submission ring, add POLLOUT.
577 */
578 memset(&e, 0, sizeof(e));
579 e.events = POLLIN;
580 e.data.fd = ringfd;
581
582 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
583 goto fail;
584 }
585
586 iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
587 iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
588 iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
589 iou->sqarray = (uint32_t*) (sq + params.sq_off.array);
590 iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
591 iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
592 iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
593 iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
594 iou->sq = sq;
595 iou->cqe = sq + params.cq_off.cqes;
596 iou->sqe = sqe;
597 iou->sqlen = sqlen;
598 iou->cqlen = cqlen;
599 iou->maxlen = maxlen;
600 iou->sqelen = sqelen;
601 iou->ringfd = ringfd;
602 iou->in_flight = 0;
603 iou->flags = 0;
604
605 if (uv__kernel_version() >= /* 5.15.0 */ 0x050F00)
606 iou->flags |= UV__MKDIRAT_SYMLINKAT_LINKAT;
607
608 for (i = 0; i <= iou->sqmask; i++)
609 iou->sqarray[i] = i; /* Slot -> sqe identity mapping. */
610
611 return;
612
613 fail:
614 if (sq != MAP_FAILED)
615 munmap(sq, maxlen);
616
617 if (sqe != MAP_FAILED)
618 munmap(sqe, sqelen);
619
620 uv__close(ringfd);
621 }
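/* Memory layout set up above: because IORING_FEAT_SINGLE_MMAP is required,
 * the submission and completion rings share one mapping of max(sqlen, cqlen)
 * bytes at offset 0 (IORING_OFF_SQ_RING), while the SQE array is a second
 * mapping at IORING_OFF_SQES. The sqarray identity mapping means submission
 * slot i always refers to sqe[i].
 */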
622
623
624 static void uv__iou_delete(struct uv__iou* iou) {
625 if (iou->ringfd > -1) {
626 munmap(iou->sq, iou->maxlen);
627 munmap(iou->sqe, iou->sqelen);
628 uv__close(iou->ringfd);
629 iou->ringfd = -1;
630 }
631 }
632
633
634 int uv__platform_loop_init(uv_loop_t* loop) {
635 uv__loop_internal_fields_t* lfields;
636
637 lfields = uv__get_internal_fields(loop);
638 lfields->ctl.ringfd = -1;
639 lfields->iou.ringfd = -2; /* "uninitialized" */
640
641 loop->inotify_watchers = NULL;
642 loop->inotify_fd = -1;
643 loop->backend_fd = epoll_create1(O_CLOEXEC);
644
645 if (loop->backend_fd == -1)
646 return UV__ERR(errno);
647
648 uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
649
650 return 0;
651 }
652
653
654 int uv__io_fork(uv_loop_t* loop) {
655 int err;
656 struct watcher_list* root;
657
658 root = uv__inotify_watchers(loop)->rbh_root;
659
660 uv__close(loop->backend_fd);
661 loop->backend_fd = -1;
662
663 /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
664 uv__platform_loop_delete(loop);
665
666 err = uv__platform_loop_init(loop);
667 if (err)
668 return err;
669
670 return uv__inotify_fork(loop, root);
671 }
672
673
674 void uv__platform_loop_delete(uv_loop_t* loop) {
675 uv__loop_internal_fields_t* lfields;
676
677 lfields = uv__get_internal_fields(loop);
678 uv__iou_delete(&lfields->ctl);
679 uv__iou_delete(&lfields->iou);
680
681 if (loop->inotify_fd != -1) {
682 uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
683 uv__close(loop->inotify_fd);
684 loop->inotify_fd = -1;
685 }
686 }
687
688
689 struct uv__invalidate {
690 struct epoll_event (*prep)[256];
691 struct epoll_event* events;
692 int nfds;
693 };
694
695
696 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
697 uv__loop_internal_fields_t* lfields;
698 struct uv__invalidate* inv;
699 struct epoll_event dummy;
700 int i;
701
702 lfields = uv__get_internal_fields(loop);
703 inv = lfields->inv;
704
705 /* Invalidate events with same file descriptor */
706 if (inv != NULL)
707 for (i = 0; i < inv->nfds; i++)
708 if (inv->events[i].data.fd == fd)
709 inv->events[i].data.fd = -1;
710
711 /* Remove the file descriptor from the epoll.
712 * This avoids a problem where the same file description remains open
713 * in another process, causing repeated junk epoll events.
714 *
715 * Perform EPOLL_CTL_DEL immediately instead of going through
716 * io_uring's submit queue, otherwise the file descriptor may
717 * be closed by the time the kernel starts the operation.
718 *
719 * We pass in a dummy epoll_event, to work around a bug in old kernels.
720 *
721 * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
722 * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
723 */
724 memset(&dummy, 0, sizeof(dummy));
725 epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
726 }
727
728
729 int uv__io_check_fd(uv_loop_t* loop, int fd) {
730 struct epoll_event e;
731 int rc;
732
733 memset(&e, 0, sizeof(e));
734 e.events = POLLIN;
735 e.data.fd = -1;
736
737 rc = 0;
738 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
739 if (errno != EEXIST)
740 rc = UV__ERR(errno);
741
742 if (rc == 0)
743 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
744 abort();
745
746 return rc;
747 }
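/* Note that the EPOLL_CTL_ADD probe above fails with EPERM for descriptors
 * that epoll cannot watch (regular files and directories, for instance), so
 * callers such as uv_poll_init() get UV_EPERM back instead of a watcher that
 * never fires.
 */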
748
749
750 /* Caller must initialize SQE and call uv__iou_submit(). */
751 static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
752 uv_loop_t* loop,
753 uv_fs_t* req) {
754 struct uv__io_uring_sqe* sqe;
755 uint32_t head;
756 uint32_t tail;
757 uint32_t mask;
758 uint32_t slot;
759
760 /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
761 * initialization failed. Anything else is a valid ring file descriptor.
762 */
763 if (iou->ringfd == -2) {
764 /* By default, the SQPOLL ring is not created. Enable it only if the loop is
765 * configured with UV_LOOP_USE_IO_URING_SQPOLL.
766 */
767 if ((loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL) == 0) {
768 iou->ringfd = -1;
769 return NULL;
770 }
771
772 uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);
773 if (iou->ringfd == -2)
774 iou->ringfd = -1; /* "failed" */
775 }
776
777 if (iou->ringfd == -1)
778 return NULL;
779
780 head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
781 memory_order_acquire);
782 tail = *iou->sqtail;
783 mask = iou->sqmask;
784
785 if ((head & mask) == ((tail + 1) & mask))
786 return NULL; /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
787
788 slot = tail & mask;
789 sqe = iou->sqe;
790 sqe = &sqe[slot];
791 memset(sqe, 0, sizeof(*sqe));
792 sqe->user_data = (uintptr_t) req;
793
794 /* Pacify uv_cancel(). */
795 req->work_req.loop = loop;
796 req->work_req.work = NULL;
797 req->work_req.done = NULL;
798 uv__queue_init(&req->work_req.wq);
799
800 uv__req_register(loop);
801 iou->in_flight++;
802
803 return sqe;
804 }
805
806
807 static void uv__iou_submit(struct uv__iou* iou) {
808 uint32_t flags;
809
810 atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
811 *iou->sqtail + 1,
812 memory_order_release);
813
814 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
815 memory_order_acquire);
816
817 if (flags & UV__IORING_SQ_NEED_WAKEUP)
818 if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
819 if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */
820 perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */
821 }
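/* The release store on sqtail above is what publishes the new SQE to the
 * kernel; the SQPOLL thread pairs it with an acquire load. When that thread
 * has idled out it sets IORING_SQ_NEED_WAKEUP in sqflags, which is why the
 * wakeup io_uring_enter() call is only made in that case.
 */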
822
823
824 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
825 struct uv__io_uring_sqe* sqe;
826 struct uv__iou* iou;
827 int kv;
828
829 kv = uv__kernel_version();
830 /* Work around a poorly understood bug in older kernels where closing a file
831 * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
832 * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
833 * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
834 * but good candidates are the several data race fixes. Interestingly, it
835 * seems to manifest only when running under Docker so the possibility of
836 * a Docker bug can't be completely ruled out either. Yay, computers.
837 * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
838 * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
839 * solved.
840 */
841 if (kv < /* 5.15.90 */ 0x050F5A)
842 return 0;
843
844 if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
845 return 0;
846
847
848 iou = &uv__get_internal_fields(loop)->iou;
849
850 sqe = uv__iou_get_sqe(iou, loop, req);
851 if (sqe == NULL)
852 return 0;
853
854 sqe->fd = req->file;
855 sqe->opcode = UV__IORING_OP_CLOSE;
856
857 uv__iou_submit(iou);
858
859 return 1;
860 }
861
862
863 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
864 uv_fs_t* req,
865 uint32_t fsync_flags) {
866 struct uv__io_uring_sqe* sqe;
867 struct uv__iou* iou;
868
869 iou = &uv__get_internal_fields(loop)->iou;
870
871 sqe = uv__iou_get_sqe(iou, loop, req);
872 if (sqe == NULL)
873 return 0;
874
875 /* Little known fact: setting sqe->off and sqe->len turns
876 * it into an asynchronous sync_file_range() operation.
877 */
878 sqe->fd = req->file;
879 sqe->fsync_flags = fsync_flags;
880 sqe->opcode = UV__IORING_OP_FSYNC;
881
882 uv__iou_submit(iou);
883
884 return 1;
885 }
886
887
888 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
889 struct uv__io_uring_sqe* sqe;
890 struct uv__iou* iou;
891
892 iou = &uv__get_internal_fields(loop)->iou;
893
894 if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
895 return 0;
896
897 sqe = uv__iou_get_sqe(iou, loop, req);
898 if (sqe == NULL)
899 return 0;
900
901 sqe->addr = (uintptr_t) req->path;
902 sqe->fd = AT_FDCWD;
903 sqe->addr2 = (uintptr_t) req->new_path;
904 sqe->len = AT_FDCWD;
905 sqe->opcode = UV__IORING_OP_LINKAT;
906
907 uv__iou_submit(iou);
908
909 return 1;
910 }
911
912
913 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
914 struct uv__io_uring_sqe* sqe;
915 struct uv__iou* iou;
916
917 iou = &uv__get_internal_fields(loop)->iou;
918
919 if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
920 return 0;
921
922 sqe = uv__iou_get_sqe(iou, loop, req);
923 if (sqe == NULL)
924 return 0;
925
926 sqe->addr = (uintptr_t) req->path;
927 sqe->fd = AT_FDCWD;
928 sqe->len = req->mode;
929 sqe->opcode = UV__IORING_OP_MKDIRAT;
930
931 uv__iou_submit(iou);
932
933 return 1;
934 }
935
936
937 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
938 struct uv__io_uring_sqe* sqe;
939 struct uv__iou* iou;
940
941 iou = &uv__get_internal_fields(loop)->iou;
942
943 sqe = uv__iou_get_sqe(iou, loop, req);
944 if (sqe == NULL)
945 return 0;
946
947 sqe->addr = (uintptr_t) req->path;
948 sqe->fd = AT_FDCWD;
949 sqe->len = req->mode;
950 sqe->opcode = UV__IORING_OP_OPENAT;
951 sqe->open_flags = req->flags | O_CLOEXEC;
952
953 uv__iou_submit(iou);
954
955 return 1;
956 }
957
958
959 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
960 struct uv__io_uring_sqe* sqe;
961 struct uv__iou* iou;
962
963 iou = &uv__get_internal_fields(loop)->iou;
964
965 sqe = uv__iou_get_sqe(iou, loop, req);
966 if (sqe == NULL)
967 return 0;
968
969 sqe->addr = (uintptr_t) req->path;
970 sqe->fd = AT_FDCWD;
971 sqe->addr2 = (uintptr_t) req->new_path;
972 sqe->len = AT_FDCWD;
973 sqe->opcode = UV__IORING_OP_RENAMEAT;
974
975 uv__iou_submit(iou);
976
977 return 1;
978 }
979
980
981 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
982 struct uv__io_uring_sqe* sqe;
983 struct uv__iou* iou;
984
985 iou = &uv__get_internal_fields(loop)->iou;
986
987 if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
988 return 0;
989
990 sqe = uv__iou_get_sqe(iou, loop, req);
991 if (sqe == NULL)
992 return 0;
993
994 sqe->addr = (uintptr_t) req->path;
995 sqe->fd = AT_FDCWD;
996 sqe->addr2 = (uintptr_t) req->new_path;
997 sqe->opcode = UV__IORING_OP_SYMLINKAT;
998
999 uv__iou_submit(iou);
1000
1001 return 1;
1002 }
1003
1004
1005 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
1006 struct uv__io_uring_sqe* sqe;
1007 struct uv__iou* iou;
1008
1009 iou = &uv__get_internal_fields(loop)->iou;
1010
1011 sqe = uv__iou_get_sqe(iou, loop, req);
1012 if (sqe == NULL)
1013 return 0;
1014
1015 sqe->addr = (uintptr_t) req->path;
1016 sqe->fd = AT_FDCWD;
1017 sqe->opcode = UV__IORING_OP_UNLINKAT;
1018
1019 uv__iou_submit(iou);
1020
1021 return 1;
1022 }
1023
1024
1025 int uv__iou_fs_read_or_write(uv_loop_t* loop,
1026 uv_fs_t* req,
1027 int is_read) {
1028 struct uv__io_uring_sqe* sqe;
1029 struct uv__iou* iou;
1030
1031 /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall back
1032 * to the threadpool on writes. */
1033 if (req->nbufs > IOV_MAX) {
1034 if (is_read)
1035 req->nbufs = IOV_MAX;
1036 else
1037 return 0;
1038 }
1039
1040 iou = &uv__get_internal_fields(loop)->iou;
1041
1042 sqe = uv__iou_get_sqe(iou, loop, req);
1043 if (sqe == NULL)
1044 return 0;
1045
1046 sqe->addr = (uintptr_t) req->bufs;
1047 sqe->fd = req->file;
1048 sqe->len = req->nbufs;
1049 sqe->off = req->off < 0 ? -1 : req->off;
1050 sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1051
1052 uv__iou_submit(iou);
1053
1054 return 1;
1055 }
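/* Note on the offset handling above: uv_fs_read()/uv_fs_write() use a negative
 * offset to mean "use and advance the current file position", and io_uring's
 * READV/WRITEV honor the same convention for an offset of -1, so negative
 * offsets are normalized to -1 before submission.
 */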
1056
1057
1058 int uv__iou_fs_statx(uv_loop_t* loop,
1059 uv_fs_t* req,
1060 int is_fstat,
1061 int is_lstat) {
1062 struct uv__io_uring_sqe* sqe;
1063 struct uv__statx* statxbuf;
1064 struct uv__iou* iou;
1065
1066 statxbuf = uv__malloc(sizeof(*statxbuf));
1067 if (statxbuf == NULL)
1068 return 0;
1069
1070 iou = &uv__get_internal_fields(loop)->iou;
1071
1072 sqe = uv__iou_get_sqe(iou, loop, req);
1073 if (sqe == NULL) {
1074 uv__free(statxbuf);
1075 return 0;
1076 }
1077
1078 req->ptr = statxbuf;
1079
1080 sqe->addr = (uintptr_t) req->path;
1081 sqe->addr2 = (uintptr_t) statxbuf;
1082 sqe->fd = AT_FDCWD;
1083 sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1084 sqe->opcode = UV__IORING_OP_STATX;
1085
1086 if (is_fstat) {
1087 sqe->addr = (uintptr_t) "";
1088 sqe->fd = req->file;
1089 sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1090 }
1091
1092 if (is_lstat)
1093 sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1094
1095 uv__iou_submit(iou);
1096
1097 return 1;
1098 }
1099
1100
1101 void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
1102 buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
1103 buf->st_mode = statxbuf->stx_mode;
1104 buf->st_nlink = statxbuf->stx_nlink;
1105 buf->st_uid = statxbuf->stx_uid;
1106 buf->st_gid = statxbuf->stx_gid;
1107 buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
1108 buf->st_ino = statxbuf->stx_ino;
1109 buf->st_size = statxbuf->stx_size;
1110 buf->st_blksize = statxbuf->stx_blksize;
1111 buf->st_blocks = statxbuf->stx_blocks;
1112 buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
1113 buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
1114 buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
1115 buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
1116 buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
1117 buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
1118 buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
1119 buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
1120 buf->st_flags = 0;
1121 buf->st_gen = 0;
1122 }
1123
1124
1125 static void uv__iou_fs_statx_post(uv_fs_t* req) {
1126 struct uv__statx* statxbuf;
1127 uv_stat_t* buf;
1128
1129 buf = &req->statbuf;
1130 statxbuf = req->ptr;
1131 req->ptr = NULL;
1132
1133 if (req->result == 0) {
1134 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
1135 uv__statx_to_stat(statxbuf, buf);
1136 req->ptr = buf;
1137 }
1138
1139 uv__free(statxbuf);
1140 }
1141
1142
1143 static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
1144 struct uv__io_uring_cqe* cqe;
1145 struct uv__io_uring_cqe* e;
1146 uv_fs_t* req;
1147 uint32_t head;
1148 uint32_t tail;
1149 uint32_t mask;
1150 uint32_t i;
1151 uint32_t flags;
1152 int nevents;
1153 int rc;
1154
1155 head = *iou->cqhead;
1156 tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
1157 memory_order_acquire);
1158 mask = iou->cqmask;
1159 cqe = iou->cqe;
1160 nevents = 0;
1161
1162 for (i = head; i != tail; i++) {
1163 e = &cqe[i & mask];
1164
1165 req = (uv_fs_t*) (uintptr_t) e->user_data;
1166 assert(req->type == UV_FS);
1167
1168 uv__req_unregister(loop);
1169 iou->in_flight--;
1170
1171 /* If the op is not supported by the kernel retry using the thread pool */
1172 if (e->res == -EOPNOTSUPP) {
1173 uv__fs_post(loop, req);
1174 continue;
1175 }
1176
1177 /* io_uring stores error codes as negative numbers, same as libuv. */
1178 req->result = e->res;
1179
1180 switch (req->fs_type) {
1181 case UV_FS_FSTAT:
1182 case UV_FS_LSTAT:
1183 case UV_FS_STAT:
1184 uv__iou_fs_statx_post(req);
1185 break;
1186 default: /* Squelch -Wswitch warnings. */
1187 break;
1188 }
1189
1190 uv__metrics_update_idle_time(loop);
1191 req->cb(req);
1192 nevents++;
1193 }
1194
1195 atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
1196 tail,
1197 memory_order_release);
1198
1199 /* Check whether CQE's overflowed, if so enter the kernel to make them
1200 * available. Don't grab them immediately but in the next loop iteration to
1201 * avoid loop starvation. */
1202 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
1203 memory_order_acquire);
1204
1205 if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
1206 do
1207 rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
1208 while (rc == -1 && errno == EINTR);
1209
1210 if (rc < 0)
1211 perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
1212 }
1213
1214 uv__metrics_inc_events(loop, nevents);
1215 if (uv__get_internal_fields(loop)->current_timeout == 0)
1216 uv__metrics_inc_events_waiting(loop, nevents);
1217 }
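/* Completions are consumed above by walking [cqhead, cqtail) and then
 * publishing the new head with a release store; that store is what tells the
 * kernel it may reuse those CQE slots. The acquire load of cqtail pairs with
 * the kernel's release store when it posts a completion.
 */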
1218
1219
1220 /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1221 * executed immediately, otherwise the file descriptor may have been closed
1222 * by the time the kernel starts the operation.
1223 */
1224 static void uv__epoll_ctl_prep(int epollfd,
1225 struct uv__iou* ctl,
1226 struct epoll_event (*events)[256],
1227 int op,
1228 int fd,
1229 struct epoll_event* e) {
1230 struct uv__io_uring_sqe* sqe;
1231 struct epoll_event* pe;
1232 uint32_t mask;
1233 uint32_t slot;
1234
1235 assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
1236 assert(ctl->ringfd != -1);
1237
1238 mask = ctl->sqmask;
1239 slot = (*ctl->sqtail)++ & mask;
1240
1241 pe = &(*events)[slot];
1242 *pe = *e;
1243
1244 sqe = ctl->sqe;
1245 sqe = &sqe[slot];
1246
1247 memset(sqe, 0, sizeof(*sqe));
1248 sqe->addr = (uintptr_t) pe;
1249 sqe->fd = epollfd;
1250 sqe->len = op;
1251 sqe->off = fd;
1252 sqe->opcode = UV__IORING_OP_EPOLL_CTL;
1253 sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
1254
1255 if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
1256 uv__epoll_ctl_flush(epollfd, ctl, events);
1257 }
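/* The user_data word packed above is decoded again in uv__epoll_ctl_flush():
 * bits 0-1 hold the epoll op (guaranteed to fit by the EPOLL_CTL_* < 4 static
 * asserts near the top of this file), bits 2-9 hold the slot index in the
 * 256-entry ring, and bits 32 and up hold the file descriptor.
 */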
1258
1259
1260 static void uv__epoll_ctl_flush(int epollfd,
1261 struct uv__iou* ctl,
1262 struct epoll_event (*events)[256]) {
1263 struct epoll_event oldevents[256];
1264 struct uv__io_uring_cqe* cqe;
1265 uint32_t oldslot;
1266 uint32_t slot;
1267 uint32_t n;
1268 int fd;
1269 int op;
1270 int rc;
1271
1272 STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
1273 assert(ctl->ringfd != -1);
1274 assert(*ctl->sqhead != *ctl->sqtail);
1275
1276 n = *ctl->sqtail - *ctl->sqhead;
1277 do
1278 rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
1279 while (rc == -1 && errno == EINTR);
1280
1281 if (rc < 0)
1282 perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
1283
1284 if (rc != (int) n)
1285 abort();
1286
1287 assert(*ctl->sqhead == *ctl->sqtail);
1288
1289 memcpy(oldevents, *events, sizeof(*events));
1290
1291 /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
1292 * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
1293 * that we are already watching. Ignore the former and retry the latter
1294 * with EPOLL_CTL_MOD.
1295 */
1296 while (*ctl->cqhead != *ctl->cqtail) {
1297 slot = (*ctl->cqhead)++ & ctl->cqmask;
1298
1299 cqe = ctl->cqe;
1300 cqe = &cqe[slot];
1301
1302 if (cqe->res == 0)
1303 continue;
1304
1305 fd = cqe->user_data >> 32;
1306 op = 3 & cqe->user_data;
1307 oldslot = 255 & (cqe->user_data >> 2);
1308
1309 if (op == EPOLL_CTL_DEL)
1310 continue;
1311
1312 if (op != EPOLL_CTL_ADD)
1313 abort();
1314
1315 if (cqe->res != -EEXIST)
1316 abort();
1317
1318 uv__epoll_ctl_prep(epollfd,
1319 ctl,
1320 events,
1321 EPOLL_CTL_MOD,
1322 fd,
1323 &oldevents[oldslot]);
1324 }
1325 }
1326
1327
1328 void uv__io_poll(uv_loop_t* loop, int timeout) {
1329 uv__loop_internal_fields_t* lfields;
1330 struct epoll_event events[1024];
1331 struct epoll_event prep[256];
1332 struct uv__invalidate inv;
1333 struct epoll_event* pe;
1334 struct epoll_event e;
1335 struct uv__iou* ctl;
1336 struct uv__iou* iou;
1337 int real_timeout;
1338 struct uv__queue* q;
1339 uv__io_t* w;
1340 sigset_t* sigmask;
1341 sigset_t sigset;
1342 uint64_t base;
1343 int have_iou_events;
1344 int have_signals;
1345 int nevents;
1346 int epollfd;
1347 int count;
1348 int nfds;
1349 int fd;
1350 int op;
1351 int i;
1352 int user_timeout;
1353 int reset_timeout;
1354
1355 lfields = uv__get_internal_fields(loop);
1356 ctl = &lfields->ctl;
1357 iou = &lfields->iou;
1358
1359 sigmask = NULL;
1360 if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
1361 sigemptyset(&sigset);
1362 sigaddset(&sigset, SIGPROF);
1363 sigmask = &sigset;
1364 }
1365
1366 assert(timeout >= -1);
1367 base = loop->time;
1368 count = 48; /* Benchmarks suggest this gives the best throughput. */
1369 real_timeout = timeout;
1370
1371 if (lfields->flags & UV_METRICS_IDLE_TIME) {
1372 reset_timeout = 1;
1373 user_timeout = timeout;
1374 timeout = 0;
1375 } else {
1376 reset_timeout = 0;
1377 user_timeout = 0;
1378 }
1379
1380 epollfd = loop->backend_fd;
1381
1382 memset(&e, 0, sizeof(e));
1383
1384 while (!uv__queue_empty(&loop->watcher_queue)) {
1385 q = uv__queue_head(&loop->watcher_queue);
1386 w = uv__queue_data(q, uv__io_t, watcher_queue);
1387 uv__queue_remove(q);
1388 uv__queue_init(q);
1389
1390 op = EPOLL_CTL_MOD;
1391 if (w->events == 0)
1392 op = EPOLL_CTL_ADD;
1393
1394 w->events = w->pevents;
1395 e.events = w->pevents;
1396 if (w == &loop->async_io_watcher)
1397 /* Enable edge-triggered mode on async_io_watcher(eventfd),
1398 * so that we're able to eliminate the overhead of reading
1399 * the eventfd via system call on each event loop wakeup.
1400 */
1401 e.events |= EPOLLET;
1402 e.data.fd = w->fd;
1403 fd = w->fd;
1404
1405 if (ctl->ringfd != -1) {
1406 uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
1407 continue;
1408 }
1409
1410 if (!epoll_ctl(epollfd, op, fd, &e))
1411 continue;
1412
1413 assert(op == EPOLL_CTL_ADD);
1414 assert(errno == EEXIST);
1415
1416 /* File descriptor that's been watched before, update event mask. */
1417 if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
1418 abort();
1419 }
1420
1421 inv.events = events;
1422 inv.prep = &prep;
1423 inv.nfds = -1;
1424
1425 for (;;) {
1426 if (loop->nfds == 0)
1427 if (iou->in_flight == 0)
1428 break;
1429
1430 /* All event mask mutations should be visible to the kernel before
1431 * we enter epoll_pwait().
1432 */
1433 if (ctl->ringfd != -1)
1434 while (*ctl->sqhead != *ctl->sqtail)
1435 uv__epoll_ctl_flush(epollfd, ctl, &prep);
1436
1437 /* Only need to set the provider_entry_time if timeout != 0. The function
1438 * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
1439 */
1440 if (timeout != 0)
1441 uv__metrics_set_provider_entry_time(loop);
1442
1443 /* Store the current timeout in a location that's globally accessible so
1444 * other locations like uv__work_done() can determine whether the queue
1445 * of events in the callback were waiting when poll was called.
1446 */
1447 lfields->current_timeout = timeout;
1448
1449 nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
1450
1451 /* Update loop->time unconditionally. It's tempting to skip the update when
1452 * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
1453 * operating system didn't reschedule our process while in the syscall.
1454 */
1455 SAVE_ERRNO(uv__update_time(loop));
1456
1457 if (nfds == -1)
1458 assert(errno == EINTR);
1459 else if (nfds == 0)
1460 /* Unlimited timeout should only return with events or signal. */
1461 assert(timeout != -1);
1462
1463 if (nfds == 0 || nfds == -1) {
1464 if (reset_timeout != 0) {
1465 timeout = user_timeout;
1466 reset_timeout = 0;
1467 } else if (nfds == 0) {
1468 return;
1469 }
1470
1471 /* Interrupted by a signal. Update timeout and poll again. */
1472 goto update_timeout;
1473 }
1474
1475 have_iou_events = 0;
1476 have_signals = 0;
1477 nevents = 0;
1478
1479 inv.nfds = nfds;
1480 lfields->inv = &inv;
1481
1482 for (i = 0; i < nfds; i++) {
1483 pe = events + i;
1484 fd = pe->data.fd;
1485
1486 /* Skip invalidated events, see uv__platform_invalidate_fd */
1487 if (fd == -1)
1488 continue;
1489
1490 if (fd == iou->ringfd) {
1491 uv__poll_io_uring(loop, iou);
1492 have_iou_events = 1;
1493 continue;
1494 }
1495
1496 assert(fd >= 0);
1497 assert((unsigned) fd < loop->nwatchers);
1498
1499 w = loop->watchers[fd];
1500
1501 if (w == NULL) {
1502 /* File descriptor that we've stopped watching, disarm it.
1503 *
1504 * Ignore all errors because we may be racing with another thread
1505 * when the file descriptor is closed.
1506 *
1507 * Perform EPOLL_CTL_DEL immediately instead of going through
1508 * io_uring's submit queue, otherwise the file descriptor may
1509 * be closed by the time the kernel starts the operation.
1510 */
1511 epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
1512 continue;
1513 }
1514
1515 /* Give users only events they're interested in. Prevents spurious
1516 * callbacks when previous callback invocation in this loop has stopped
1517 * the current watcher. Also, filters out events that the user has not
1518 * requested us to watch.
1519 */
1520 pe->events &= w->pevents | POLLERR | POLLHUP;
1521
1522 /* Work around an epoll quirk where it sometimes reports just the
1523 * EPOLLERR or EPOLLHUP event. In order to force the event loop to
1524 * move forward, we merge in the read/write events that the watcher
1525 * is interested in; uv__read() and uv__write() will then deal with
1526 * the error or hangup in the usual fashion.
1527 *
1528 * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
1529 * reads the available data, calls uv_read_stop(), then sometime later
1530 * calls uv_read_start() again. By then, libuv has forgotten about the
1531 * hangup and the kernel won't report EPOLLIN again because there's
1532 * nothing left to read. If anything, libuv is to blame here. The
1533 * current hack is just a quick bandaid; to properly fix it, libuv
1534 * needs to remember the error/hangup event. We should get that for
1535 * free when we switch over to edge-triggered I/O.
1536 */
1537 if (pe->events == POLLERR || pe->events == POLLHUP)
1538 pe->events |=
1539 w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);
1540
1541 if (pe->events != 0) {
1542 /* Run signal watchers last. This also affects child process watchers
1543 * because those are implemented in terms of signal watchers.
1544 */
1545 if (w == &loop->signal_io_watcher) {
1546 have_signals = 1;
1547 } else {
1548 uv__metrics_update_idle_time(loop);
1549 w->cb(loop, w, pe->events);
1550 }
1551
1552 nevents++;
1553 }
1554 }
1555
1556 uv__metrics_inc_events(loop, nevents);
1557 if (reset_timeout != 0) {
1558 timeout = user_timeout;
1559 reset_timeout = 0;
1560 uv__metrics_inc_events_waiting(loop, nevents);
1561 }
1562
1563 if (have_signals != 0) {
1564 uv__metrics_update_idle_time(loop);
1565 loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
1566 }
1567
1568 lfields->inv = NULL;
1569
1570 if (have_iou_events != 0)
1571 break; /* Event loop should cycle now so don't poll again. */
1572
1573 if (have_signals != 0)
1574 break; /* Event loop should cycle now so don't poll again. */
1575
1576 if (nevents != 0) {
1577 if (nfds == ARRAY_SIZE(events) && --count != 0) {
1578 /* Poll for more events but don't block this time. */
1579 timeout = 0;
1580 continue;
1581 }
1582 break;
1583 }
1584
1585 update_timeout:
1586 if (timeout == 0)
1587 break;
1588
1589 if (timeout == -1)
1590 continue;
1591
1592 assert(timeout > 0);
1593
1594 real_timeout -= (loop->time - base);
1595 if (real_timeout <= 0)
1596 break;
1597
1598 timeout = real_timeout;
1599 }
1600
1601 if (ctl->ringfd != -1)
1602 while (*ctl->sqhead != *ctl->sqtail)
1603 uv__epoll_ctl_flush(epollfd, ctl, &prep);
1604 }
1605
1606 uint64_t uv__hrtime(uv_clocktype_t type) {
1607 static _Atomic clock_t fast_clock_id = -1;
1608 struct timespec t;
1609 clock_t clock_id;
1610
1611 /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
1612 * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is
1613 * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
1614 * decide to make a costly system call.
1615 */
1616 /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
1617 * when it has microsecond granularity or better (unlikely).
1618 */
1619 clock_id = CLOCK_MONOTONIC;
1620 if (type != UV_CLOCK_FAST)
1621 goto done;
1622
1623 clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
1624 if (clock_id != -1)
1625 goto done;
1626
1627 clock_id = CLOCK_MONOTONIC;
1628 if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
1629 if (t.tv_nsec <= 1 * 1000 * 1000)
1630 clock_id = CLOCK_MONOTONIC_COARSE;
1631
1632 atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
1633
1634 done:
1635
1636 if (clock_gettime(clock_id, &t))
1637 return 0; /* Not really possible. */
1638
1639 return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
1640 }
1641
1642
1643 int uv_resident_set_memory(size_t* rss) {
1644 char buf[1024];
1645 const char* s;
1646 long val;
1647 int rc;
1648 int i;
1649
1650 /* rss: 24th element */
1651 rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1652 if (rc < 0)
1653 return rc;
1654
1655 /* find the last ')' */
1656 s = strrchr(buf, ')');
1657 if (s == NULL)
1658 goto err;
1659
1660 for (i = 1; i <= 22; i++) {
1661 s = strchr(s + 1, ' ');
1662 if (s == NULL)
1663 goto err;
1664 }
1665
1666 errno = 0;
1667 val = strtol(s, NULL, 10);
1668 if (val < 0 || errno != 0)
1669 goto err;
1670
1671 *rss = val * getpagesize();
1672 return 0;
1673
1674 err:
1675 return UV_EINVAL;
1676 }
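/* Parsing note for the code above: in /proc/self/stat the second field is the
 * command name in parentheses and may itself contain spaces or ')', which is
 * why the scan starts from the last ')' found by strrchr(). From there, 22
 * space-separated fields are skipped to land on field 24 (rss, in pages),
 * hence the getpagesize() multiplication.
 */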
1677
1678 int uv_uptime(double* uptime) {
1679 struct timespec now;
1680 char buf[128];
1681
1682 /* Consult /proc/uptime when present (common case), or fall back to
1683 * clock_gettime. Why not always clock_gettime? It doesn't always return the
1684 * right result under OpenVZ and possibly other containerized environments.
1685 */
1686 if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1687 if (1 == sscanf(buf, "%lf", uptime))
1688 return 0;
1689
1690 if (clock_gettime(CLOCK_BOOTTIME, &now))
1691 return UV__ERR(errno);
1692
1693 *uptime = now.tv_sec;
1694 return 0;
1695 }
1696
1697
1698 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
1699 #if defined(__PPC__)
1700 static const char model_marker[] = "cpu\t\t: ";
1701 #elif defined(__arm__)
1702 static const char model_marker[] = "Processor\t: ";
1703 #elif defined(__aarch64__)
1704 static const char model_marker[] = "CPU part\t: ";
1705 #elif defined(__mips__)
1706 static const char model_marker[] = "cpu model\t\t: ";
1707 #elif defined(__loongarch__)
1708 static const char model_marker[] = "cpu family\t\t: ";
1709 #else
1710 static const char model_marker[] = "model name\t: ";
1711 #endif
1712 static const char parts[] =
1713 #ifdef __aarch64__
1714 "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n"
1715 "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n"
1716 "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n"
1717 "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
1718 "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n"
1719 "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n"
1720 "0xc0d\nCortex-A17\n" /* Originally A12 */
1721 "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n"
1722 "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n"
1723 "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n"
1724 "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n"
1725 "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n"
1726 "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n"
1727 "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n"
1728 "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n"
1729 "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n"
1730 "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n"
1731 "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n"
1732 #endif
1733 "";
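/* The parts table above is a flat sequence of "hex-part-id\n" / "model-name\n"
 * pairs. Further down, the "CPU part" value read from /proc/cpuinfo (e.g.
 * "0xd07") plus its trailing newline is located with memmem(), and the model
 * name is the string that immediately follows it.
 */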
1734 struct cpu {
1735 unsigned long long freq, user, nice, sys, idle, irq;
1736 unsigned model;
1737 };
1738 FILE* fp;
1739 char* p;
1740 int found;
1741 int n;
1742 unsigned i;
1743 unsigned cpu;
1744 unsigned maxcpu;
1745 unsigned size;
1746 unsigned long long skip;
1747 struct cpu (*cpus)[8192]; /* Kernel maximum. */
1748 struct cpu* c;
1749 struct cpu t;
1750 char (*model)[64];
1751 unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
1752 /* Assumption: even big.LITTLE systems will have only a handful
1753 * of different CPU models. Most systems will just have one.
1754 */
1755 char models[8][64];
1756 char buf[1024];
1757
1758 memset(bitmap, 0, sizeof(bitmap));
1759 memset(models, 0, sizeof(models));
1760 snprintf(*models, sizeof(*models), "unknown");
1761 maxcpu = 0;
1762
1763 cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
1764 if (cpus == NULL)
1765 return UV_ENOMEM;
1766
1767 fp = uv__open_file("/proc/stat");
1768 if (fp == NULL) {
1769 uv__free(cpus);
1770 return UV__ERR(errno);
1771 }
1772
1773 if (NULL == fgets(buf, sizeof(buf), fp))
1774 abort();
1775
1776 for (;;) {
1777 memset(&t, 0, sizeof(t));
1778
1779 n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
1780 &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
1781
1782 if (n != 7)
1783 break;
1784
1785 if (NULL == fgets(buf, sizeof(buf), fp))
1786 abort();
1787
1788 if (cpu >= ARRAY_SIZE(*cpus))
1789 continue;
1790
1791 (*cpus)[cpu] = t;
1792
1793 bitmap[cpu >> 3] |= 1 << (cpu & 7);
1794
1795 if (cpu >= maxcpu)
1796 maxcpu = cpu + 1;
1797 }
1798
1799 fclose(fp);
1800
1801 fp = uv__open_file("/proc/cpuinfo");
1802 if (fp == NULL)
1803 goto nocpuinfo;
1804
1805 for (;;) {
1806 if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
1807 break; /* Parse error. */
1808
1809 found = 0;
1810 while (!found && fgets(buf, sizeof(buf), fp))
1811 found = !strncmp(buf, model_marker, sizeof(model_marker) - 1);
1812
1813 if (!found)
1814 goto next;
1815
1816 p = buf + sizeof(model_marker) - 1;
1817 n = (int) strcspn(p, "\n");
1818
1819 /* arm64: translate CPU part code to model name. */
1820 if (*parts) {
1821 p = memmem(parts, sizeof(parts) - 1, p, n + 1);
1822 if (p == NULL)
1823 p = "unknown";
1824 else
1825 p += n + 1;
1826 n = (int) strcspn(p, "\n");
1827 }
1828
1829 found = 0;
1830 for (model = models; !found && model < ARRAY_END(models); model++)
1831 found = !strncmp(p, *model, strlen(*model));
1832
1833 if (!found)
1834 goto next;
1835
1836 if (**model == '\0')
1837 snprintf(*model, sizeof(*model), "%.*s", n, p);
1838
1839 if (cpu < maxcpu)
1840 (*cpus)[cpu].model = model - models;
1841
1842 next:
1843 while (fgets(buf, sizeof(buf), fp))
1844 if (*buf == '\n')
1845 break;
1846 }
1847
1848 fclose(fp);
1849 fp = NULL;
1850
1851 nocpuinfo:
1852
1853 n = 0;
1854 for (cpu = 0; cpu < maxcpu; cpu++) {
1855 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1856 continue;
1857
1858 n++;
1859 snprintf(buf, sizeof(buf),
1860 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
1861
1862 fp = uv__open_file(buf);
1863 if (fp == NULL)
1864 continue;
1865
1866 if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
1867 abort();
1868 fclose(fp);
1869 fp = NULL;
1870 }
1871
1872 size = n * sizeof(**ci) + sizeof(models);
1873 *ci = uv__malloc(size);
1874 *count = 0;
1875
1876 if (*ci == NULL) {
1877 uv__free(cpus);
1878 return UV_ENOMEM;
1879 }
1880
1881 *count = n;
1882 p = memcpy(*ci + n, models, sizeof(models));
1883
1884 i = 0;
1885 for (cpu = 0; cpu < maxcpu; cpu++) {
1886 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1887 continue;
1888
1889 c = *cpus + cpu;
1890
1891 (*ci)[i++] = (uv_cpu_info_t) {
1892 .model = p + c->model * sizeof(*model),
1893 .speed = c->freq / 1000,
1894 /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
1895 * therefore the multiplier is always 1000/100 = 10.
1896 */
1897 .cpu_times = (struct uv_cpu_times_s) {
1898 .user = 10 * c->user,
1899 .nice = 10 * c->nice,
1900 .sys = 10 * c->sys,
1901 .idle = 10 * c->idle,
1902 .irq = 10 * c->irq,
1903 },
1904 };
1905 }
1906
1907 uv__free(cpus);
1908
1909 return 0;
1910 }
1911
1912
1913 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
1914 if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
1915 return 1;
1916 if (ent->ifa_addr == NULL)
1917 return 1;
1918 /*
1919 * On Linux getifaddrs returns information related to the raw underlying
1920 * devices. We're not interested in this information yet.
1921 */
1922 if (ent->ifa_addr->sa_family == PF_PACKET)
1923 return exclude_type;
1924 return !exclude_type;
1925 }
1926
1927 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
1928 struct ifaddrs *addrs, *ent;
1929 uv_interface_address_t* address;
1930 int i;
1931 struct sockaddr_ll *sll;
1932
1933 *count = 0;
1934 *addresses = NULL;
1935
1936 if (getifaddrs(&addrs))
1937 return UV__ERR(errno);
1938
1939 /* Count the number of interfaces */
1940 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1941 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1942 continue;
1943
1944 (*count)++;
1945 }
1946
1947 if (*count == 0) {
1948 freeifaddrs(addrs);
1949 return 0;
1950 }
1951
1952 /* Make sure the memory is initialized to zero using calloc() */
1953 *addresses = uv__calloc(*count, sizeof(**addresses));
1954 if (!(*addresses)) {
1955 freeifaddrs(addrs);
1956 return UV_ENOMEM;
1957 }
1958
1959 address = *addresses;
1960
1961 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1962 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1963 continue;
1964
1965 address->name = uv__strdup(ent->ifa_name);
1966
1967 if (ent->ifa_addr->sa_family == AF_INET6) {
1968 address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
1969 } else {
1970 address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
1971 }
1972
1973 if (ent->ifa_netmask->sa_family == AF_INET6) {
1974 address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
1975 } else {
1976 address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
1977 }
1978
1979 address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
1980
1981 address++;
1982 }
1983
1984 /* Fill in physical addresses for each interface */
1985 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1986 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
1987 continue;
1988
1989 address = *addresses;
1990
1991 for (i = 0; i < (*count); i++) {
1992 size_t namelen = strlen(ent->ifa_name);
1993       /* Alias interfaces share the same physical address */
1994 if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
1995 (address->name[namelen] == 0 || address->name[namelen] == ':')) {
1996 sll = (struct sockaddr_ll*)ent->ifa_addr;
1997 memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
1998 }
1999 address++;
2000 }
2001 }
2002
2003 freeifaddrs(addrs);
2004
2005 return 0;
2006 }
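/* Illustrative sketch, not part of libuv itself: callers pair
 * uv_interface_addresses() with uv_free_interface_addresses(), e.g.
 *
 *   uv_interface_address_t* info;
 *   int count;
 *   if (uv_interface_addresses(&info, &count) == 0) {
 *     for (int i = 0; i < count; i++)
 *       printf("%s%s\n", info[i].name, info[i].is_internal ? " (internal)" : "");
 *     uv_free_interface_addresses(info, count);
 *   }
 */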
2007
2008
2009 void uv_free_interface_addresses(uv_interface_address_t* addresses,
2010 int count) {
2011 int i;
2012
2013 for (i = 0; i < count; i++) {
2014 uv__free(addresses[i].name);
2015 }
2016
2017 uv__free(addresses);
2018 }
2019
2020
2021 void uv__set_process_title(const char* title) {
2022 #if defined(PR_SET_NAME)
2023 prctl(PR_SET_NAME, title); /* Only copies first 16 characters. */
2024 #endif
2025 }
2026
2027
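/* Illustrative note: /proc/meminfo entries look like "MemAvailable:   16263136 kB"
 * (value in kibibytes), so the helper below scans past the requested label and
 * converts the figure to bytes by multiplying by 1024.
 */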
2028 static uint64_t uv__read_proc_meminfo(const char* what) {
2029 uint64_t rc;
2030 char* p;
2031 char buf[4096]; /* Large enough to hold all of /proc/meminfo. */
2032
2033 if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
2034 return 0;
2035
2036 p = strstr(buf, what);
2037
2038 if (p == NULL)
2039 return 0;
2040
2041 p += strlen(what);
2042
2043 rc = 0;
2044 sscanf(p, "%" PRIu64 " kB", &rc);
2045
2046 return rc * 1024;
2047 }
2048
2049
2050 uint64_t uv_get_free_memory(void) {
2051 struct sysinfo info;
2052 uint64_t rc;
2053
2054 rc = uv__read_proc_meminfo("MemAvailable:");
2055
2056 if (rc != 0)
2057 return rc;
2058
2059 if (0 == sysinfo(&info))
2060 return (uint64_t) info.freeram * info.mem_unit;
2061
2062 return 0;
2063 }
2064
2065
2066 uint64_t uv_get_total_memory(void) {
2067 struct sysinfo info;
2068 uint64_t rc;
2069
2070 rc = uv__read_proc_meminfo("MemTotal:");
2071
2072 if (rc != 0)
2073 return rc;
2074
2075 if (0 == sysinfo(&info))
2076 return (uint64_t) info.totalram * info.mem_unit;
2077
2078 return 0;
2079 }
2080
2081
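/* Note: the cgroup control files read through this helper contain either a
 * decimal number or (in cgroup v2) the literal string "max"; the latter is
 * mapped to UINT64_MAX, and 0 is returned when the file cannot be read.
 */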
2082 static uint64_t uv__read_uint64(const char* filename) {
2083 char buf[32]; /* Large enough to hold an encoded uint64_t. */
2084 uint64_t rc;
2085
2086 rc = 0;
2087 if (0 == uv__slurp(filename, buf, sizeof(buf)))
2088 if (1 != sscanf(buf, "%" PRIu64, &rc))
2089 if (0 == strcmp(buf, "max\n"))
2090 rc = UINT64_MAX;
2091
2092 return rc;
2093 }
2094
2095
2096 /* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
2097 * finds the location and length of the memory controller mount path.
2098 * This disregards the leading / for easy concatenation of paths.
2099 * Returns NULL if the memory controller wasn't found. */
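/* Illustrative example: given a cgroup v1 line such as "9:memory:/user.slice",
 * this returns a pointer to "user.slice" and sets *n to its length, letting
 * callers build paths like "/sys/fs/cgroup/memory/user.slice/memory.limit_in_bytes".
 */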
2100 static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
2101 int* n) {
2102 char* p;
2103
2104 /* Seek to the memory controller line. */
2105 p = strchr(buf, ':');
2106 while (p != NULL && strncmp(p, ":memory:", 8)) {
2107 p = strchr(p, '\n');
2108 if (p != NULL)
2109 p = strchr(p, ':');
2110 }
2111
2112 if (p != NULL) {
2113 /* Determine the length of the mount path. */
2114 p = p + strlen(":memory:/");
2115 *n = (int) strcspn(p, "\n");
2116 }
2117
2118 return p;
2119 }
2120
2121 static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
2122 uint64_t* max) {
2123 char filename[4097];
2124 char* p;
2125 int n;
2126 uint64_t cgroup1_max;
2127
2128 /* Find out where the controller is mounted. */
2129 p = uv__cgroup1_find_memory_controller(buf, &n);
2130 if (p != NULL) {
2131 snprintf(filename, sizeof(filename),
2132 "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
2133 *high = uv__read_uint64(filename);
2134
2135 snprintf(filename, sizeof(filename),
2136 "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
2137 *max = uv__read_uint64(filename);
2138
2139 /* If the controller wasn't mounted, the reads above will have failed,
2140 * as indicated by uv__read_uint64 returning 0.
2141 */
2142 if (*high != 0 && *max != 0)
2143 goto update_limits;
2144 }
2145
2146 /* Fall back to the limits of the global memory controller. */
2147 *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
2148 *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
2149
2150 /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
2151 * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
2152 */
2153 update_limits:
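  /* For example, on a 64-bit system with 4 KiB pages this evaluates to
   * 0x7ffffffffffff000, the value cgroup1 reports when no limit is set. */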
2154 cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
2155 if (*high == cgroup1_max)
2156 *high = UINT64_MAX;
2157 if (*max == cgroup1_max)
2158 *max = UINT64_MAX;
2159 }
2160
2161 static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
2162 uint64_t* max) {
2163 char filename[4097];
2164 char* p;
2165 int n;
2166
2167 /* Find out where the controller is mounted. */
2168 p = buf + strlen("0::/");
2169 n = (int) strcspn(p, "\n");
2170
2171 /* Read the memory limits of the controller. */
2172 snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
2173 *max = uv__read_uint64(filename);
2174 snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
2175 *high = uv__read_uint64(filename);
2176 }
2177
2178 static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
2179 uint64_t high;
2180 uint64_t max;
2181
2182 /* In the case of cgroupv2, we'll only have a single entry. */
2183 if (strncmp(buf, "0::/", 4))
2184 uv__get_cgroup1_memory_limits(buf, &high, &max);
2185 else
2186 uv__get_cgroup2_memory_limits(buf, &high, &max);
2187
2188 if (high == 0 || max == 0)
2189 return 0;
2190
2191 return high < max ? high : max;
2192 }
2193
2194 uint64_t uv_get_constrained_memory(void) {
2195 char buf[1024];
2196
2197 if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2198 return 0;
2199
2200 return uv__get_cgroup_constrained_memory(buf);
2201 }
2202
2203
2204 static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
2205 char filename[4097];
2206 uint64_t current;
2207 char* p;
2208 int n;
2209
2210 /* Find out where the controller is mounted. */
2211 p = uv__cgroup1_find_memory_controller(buf, &n);
2212 if (p != NULL) {
2213 snprintf(filename, sizeof(filename),
2214 "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
2215 current = uv__read_uint64(filename);
2216
2217 /* If the controller wasn't mounted, the reads above will have failed,
2218 * as indicated by uv__read_uint64 returning 0.
2219 */
2220 if (current != 0)
2221 return current;
2222 }
2223
2224 /* Fall back to the usage of the global memory controller. */
2225 return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
2226 }
2227
2228 static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
2229 char filename[4097];
2230 char* p;
2231 int n;
2232
2233 /* Find out where the controller is mounted. */
2234 p = buf + strlen("0::/");
2235 n = (int) strcspn(p, "\n");
2236
2237 snprintf(filename, sizeof(filename),
2238 "/sys/fs/cgroup/%.*s/memory.current", n, p);
2239 return uv__read_uint64(filename);
2240 }
2241
2242 uint64_t uv_get_available_memory(void) {
2243 char buf[1024];
2244 uint64_t constrained;
2245 uint64_t current;
2246 uint64_t total;
2247
2248 if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2249 return 0;
2250
2251 constrained = uv__get_cgroup_constrained_memory(buf);
2252 if (constrained == 0)
2253 return uv_get_free_memory();
2254
2255 total = uv_get_total_memory();
2256 if (constrained > total)
2257 return uv_get_free_memory();
2258
2259 /* In the case of cgroupv2, we'll only have a single entry. */
2260 if (strncmp(buf, "0::/", 4))
2261 current = uv__get_cgroup1_current_memory(buf);
2262 else
2263 current = uv__get_cgroup2_current_memory(buf);
2264
2265 /* memory usage can be higher than the limit (for short bursts of time) */
2266 if (constrained < current)
2267 return 0;
2268
2269 return constrained - current;
2270 }
2271
2272
2273 static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
2274 uv__cpu_constraint* constraint) {
2275 char path[256];
2276 char buf[1024];
2277 unsigned int weight;
2278 int cgroup_size;
2279 const char* cgroup_trimmed;
2280 char quota_buf[16];
2281
2282 if (strncmp(cgroup, "0::/", 4) != 0)
2283 return UV_EINVAL;
2284
2285   /* Locate the cgroup path that follows the "0::/" prefix */
2286   cgroup_trimmed = cgroup + sizeof("0::/") - 1; /* Skip the prefix "0::/" */
2287   cgroup_size = (int)strcspn(cgroup_trimmed, "\n"); /* Length up to the newline */
2288
2289 /* Construct the path to the cpu.max file */
2290 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.max", cgroup_size,
2291 cgroup_trimmed);
2292
2293 /* Read cpu.max */
2294 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2295 return UV_EIO;
2296
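  /* cpu.max holds "<quota> <period>" in microseconds, e.g. "200000 100000"
   * (two CPUs' worth of runtime per period) or "max 100000" when unlimited. */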
2297 if (sscanf(buf, "%15s %llu", quota_buf, &constraint->period_length) != 2)
2298 return UV_EINVAL;
2299
2300 if (strncmp(quota_buf, "max", 3) == 0)
2301 constraint->quota_per_period = LLONG_MAX;
2302 else if (sscanf(quota_buf, "%lld", &constraint->quota_per_period) != 1)
2303     return UV_EINVAL; /* conversion failed */
2304
2305 /* Construct the path to the cpu.weight file */
2306 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.weight", cgroup_size,
2307 cgroup_trimmed);
2308
2309 /* Read cpu.weight */
2310 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2311 return UV_EIO;
2312
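  /* cpu.weight ranges from 1 to 10000 with a default of 100, hence the
   * division by 100 below to express it relative to the default. */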
2313 if (sscanf(buf, "%u", &weight) != 1)
2314 return UV_EINVAL;
2315
2316 constraint->proportions = (double)weight / 100.0;
2317
2318 return 0;
2319 }
2320
2321 static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
2322 int* cgroup_size) {
2323 /* Seek to the cpu controller line. */
2324 char* cgroup_cpu = strstr(cgroup, ":cpu,");
2325
2326 if (cgroup_cpu != NULL) {
2327 /* Skip the controller prefix to the start of the cgroup path. */
2328 cgroup_cpu += sizeof(":cpu,") - 1;
2329 /* Determine the length of the cgroup path, excluding the newline. */
2330 *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
2331 }
2332
2333 return cgroup_cpu;
2334 }
2335
2336 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
2337 uv__cpu_constraint* constraint) {
2338 char path[256];
2339 char buf[1024];
2340 unsigned int shares;
2341 int cgroup_size;
2342 char* cgroup_cpu;
2343
2344 cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
2345
2346 if (cgroup_cpu == NULL)
2347 return UV_EIO;
2348
2349 /* Construct the path to the cpu.cfs_quota_us file */
2350 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
2351 cgroup_size, cgroup_cpu);
2352
2353 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2354 return UV_EIO;
2355
2356 if (sscanf(buf, "%lld", &constraint->quota_per_period) != 1)
2357 return UV_EINVAL;
2358
2359 /* Construct the path to the cpu.cfs_period_us file */
2360 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
2361 cgroup_size, cgroup_cpu);
2362
2363 /* Read cpu.cfs_period_us */
2364 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2365 return UV_EIO;
2366
2367 if (sscanf(buf, "%lld", &constraint->period_length) != 1)
2368 return UV_EINVAL;
2369
2370 /* Construct the path to the cpu.shares file */
2371 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.shares", cgroup_size,
2372 cgroup_cpu);
2373
2374 /* Read cpu.shares */
2375 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2376 return UV_EIO;
2377
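  /* cpu.shares defaults to 1024 in cgroup v1, hence the division by 1024
   * below to express the weight relative to the default. */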
2378 if (sscanf(buf, "%u", &shares) != 1)
2379 return UV_EINVAL;
2380
2381 constraint->proportions = (double)shares / 1024.0;
2382
2383 return 0;
2384 }
2385
2386 int uv__get_constrained_cpu(uv__cpu_constraint* constraint) {
2387 char cgroup[1024];
2388
2389 /* Read the cgroup from /proc/self/cgroup */
2390 if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
2391 return UV_EIO;
2392
2393 /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
2394 * The entry for cgroup v2 is always in the format "0::$PATH"
2395 * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
2396 if (strncmp(cgroup, "0::/", 4) == 0)
2397 return uv__get_cgroupv2_constrained_cpu(cgroup, constraint);
2398 else
2399 return uv__get_cgroupv1_constrained_cpu(cgroup, constraint);
2400 }
2401
2402
2403 void uv_loadavg(double avg[3]) {
2404 struct sysinfo info;
2405 char buf[128]; /* Large enough to hold all of /proc/loadavg. */
2406
2407 if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
2408 if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
2409 return;
2410
2411 if (sysinfo(&info) < 0)
2412 return;
2413
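  /* sysinfo() reports load averages as fixed-point values scaled by
   * 1 << SI_LOAD_SHIFT (65536); divide to recover floating-point numbers. */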
2414 avg[0] = (double) info.loads[0] / 65536.0;
2415 avg[1] = (double) info.loads[1] / 65536.0;
2416 avg[2] = (double) info.loads[2] / 65536.0;
2417 }
2418
2419
2420 static int compare_watchers(const struct watcher_list* a,
2421 const struct watcher_list* b) {
2422 if (a->wd < b->wd) return -1;
2423 if (a->wd > b->wd) return 1;
2424 return 0;
2425 }
2426
2427
2428 static int init_inotify(uv_loop_t* loop) {
2429 int fd;
2430
2431 if (loop->inotify_fd != -1)
2432 return 0;
2433
2434 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
2435 if (fd < 0)
2436 return UV__ERR(errno);
2437
2438 loop->inotify_fd = fd;
2439 uv__io_init(&loop->inotify_read_watcher, uv__inotify_read, loop->inotify_fd);
2440 uv__io_start(loop, &loop->inotify_read_watcher, POLLIN);
2441
2442 return 0;
2443 }
2444
2445
2446 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
2447 /* Open the inotify_fd, and re-arm all the inotify watchers. */
2448 int err;
2449 struct watcher_list* tmp_watcher_list_iter;
2450 struct watcher_list* watcher_list;
2451 struct watcher_list tmp_watcher_list;
2452 struct uv__queue queue;
2453 struct uv__queue* q;
2454 uv_fs_event_t* handle;
2455 char* tmp_path;
2456
2457 if (root == NULL)
2458 return 0;
2459
2460 /* We must restore the old watcher list to be able to close items
2461 * out of it.
2462 */
2463 loop->inotify_watchers = root;
2464
2465 uv__queue_init(&tmp_watcher_list.watchers);
2466 /* Note that the queue we use is shared with the start and stop()
2467 * functions, making uv__queue_foreach unsafe to use. So we use the
2468 * uv__queue_move trick to safely iterate. Also don't free the watcher
2469 * list until we're done iterating. c.f. uv__inotify_read.
2470 */
2471 RB_FOREACH_SAFE(watcher_list, watcher_root,
2472 uv__inotify_watchers(loop), tmp_watcher_list_iter) {
2473 watcher_list->iterating = 1;
2474 uv__queue_move(&watcher_list->watchers, &queue);
2475 while (!uv__queue_empty(&queue)) {
2476 q = uv__queue_head(&queue);
2477 handle = uv__queue_data(q, uv_fs_event_t, watchers);
2478 /* It's critical to keep a copy of path here, because it
2479 * will be set to NULL by stop() and then deallocated by
2480 * maybe_free_watcher_list
2481 */
2482 tmp_path = uv__strdup(handle->path);
2483 assert(tmp_path != NULL);
2484 uv__queue_remove(q);
2485 uv__queue_insert_tail(&watcher_list->watchers, q);
2486 uv_fs_event_stop(handle);
2487
2488 uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
2489 handle->path = tmp_path;
2490 }
2491 watcher_list->iterating = 0;
2492 maybe_free_watcher_list(watcher_list, loop);
2493 }
2494
2495 uv__queue_move(&tmp_watcher_list.watchers, &queue);
2496 while (!uv__queue_empty(&queue)) {
2497 q = uv__queue_head(&queue);
2498 uv__queue_remove(q);
2499 handle = uv__queue_data(q, uv_fs_event_t, watchers);
2500 tmp_path = handle->path;
2501 handle->path = NULL;
2502 err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
2503 uv__free(tmp_path);
2504 if (err)
2505 return err;
2506 }
2507
2508 return 0;
2509 }
2510
2511
2512 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
2513 struct watcher_list w;
2514 w.wd = wd;
2515 return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
2516 }
2517
2518
2519 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
2520 /* if the watcher_list->watchers is being iterated over, we can't free it. */
2521 if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
2522 /* No watchers left for this path. Clean up. */
2523 RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
2524 inotify_rm_watch(loop->inotify_fd, w->wd);
2525 uv__free(w);
2526 }
2527 }
2528
2529
2530 static void uv__inotify_read(uv_loop_t* loop,
2531 uv__io_t* dummy,
2532 unsigned int events) {
2533 const struct inotify_event* e;
2534 struct watcher_list* w;
2535 uv_fs_event_t* h;
2536 struct uv__queue queue;
2537 struct uv__queue* q;
2538 const char* path;
2539 ssize_t size;
2540 const char *p;
2541 /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
2542 char buf[4096];
2543
2544 for (;;) {
2545 do
2546 size = read(loop->inotify_fd, buf, sizeof(buf));
2547 while (size == -1 && errno == EINTR);
2548
2549 if (size == -1) {
2550 assert(errno == EAGAIN || errno == EWOULDBLOCK);
2551 break;
2552 }
2553
2554 assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */
2555
2556 /* Now we have one or more inotify_event structs. */
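    /* Each record occupies sizeof(struct inotify_event) plus e->len bytes of
     * NUL-padded name, which is why the cursor advances by that amount. */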
2557 for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
2558 e = (const struct inotify_event*) p;
2559
2560 events = 0;
2561 if (e->mask & (IN_ATTRIB|IN_MODIFY))
2562 events |= UV_CHANGE;
2563 if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
2564 events |= UV_RENAME;
2565
2566 w = find_watcher(loop, e->wd);
2567 if (w == NULL)
2568 continue; /* Stale event, no watchers left. */
2569
2570 /* inotify does not return the filename when monitoring a single file
2571 * for modifications. Repurpose the filename for API compatibility.
2572 * I'm not convinced this is a good thing, maybe it should go.
2573 */
2574 path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);
2575
2576 /* We're about to iterate over the queue and call user's callbacks.
2577 * What can go wrong?
2578 * A callback could call uv_fs_event_stop()
2579 * and the queue can change under our feet.
2580 * So, we use uv__queue_move() trick to safely iterate over the queue.
2581 * And we don't free the watcher_list until we're done iterating.
2582 *
2583 * First,
2584 * tell uv_fs_event_stop() (that could be called from a user's callback)
2585 * not to free watcher_list.
2586 */
2587 w->iterating = 1;
2588 uv__queue_move(&w->watchers, &queue);
2589 while (!uv__queue_empty(&queue)) {
2590 q = uv__queue_head(&queue);
2591 h = uv__queue_data(q, uv_fs_event_t, watchers);
2592
2593 uv__queue_remove(q);
2594 uv__queue_insert_tail(&w->watchers, q);
2595
2596 h->cb(h, path, events, 0);
2597 }
2598 /* done iterating, time to (maybe) free empty watcher_list */
2599 w->iterating = 0;
2600 maybe_free_watcher_list(w, loop);
2601 }
2602 }
2603 }
2604
2605
2606 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
2607 uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
2608 return 0;
2609 }
2610
2611
2612 int uv_fs_event_start(uv_fs_event_t* handle,
2613 uv_fs_event_cb cb,
2614 const char* path,
2615 unsigned int flags) {
2616 struct watcher_list* w;
2617 uv_loop_t* loop;
2618 size_t len;
2619 int events;
2620 int err;
2621 int wd;
2622
2623 if (uv__is_active(handle))
2624 return UV_EINVAL;
2625
2626 loop = handle->loop;
2627
2628 err = init_inotify(loop);
2629 if (err)
2630 return err;
2631
2632 events = IN_ATTRIB
2633 | IN_CREATE
2634 | IN_MODIFY
2635 | IN_DELETE
2636 | IN_DELETE_SELF
2637 | IN_MOVE_SELF
2638 | IN_MOVED_FROM
2639 | IN_MOVED_TO;
2640
2641 wd = inotify_add_watch(loop->inotify_fd, path, events);
2642 if (wd == -1)
2643 return UV__ERR(errno);
2644
2645 w = find_watcher(loop, wd);
2646 if (w)
2647 goto no_insert;
2648
2649 len = strlen(path) + 1;
2650 w = uv__malloc(sizeof(*w) + len);
2651 if (w == NULL)
2652 return UV_ENOMEM;
2653
2654 w->wd = wd;
2655 w->path = memcpy(w + 1, path, len);
2656 uv__queue_init(&w->watchers);
2657 w->iterating = 0;
2658 RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
2659
2660 no_insert:
2661 uv__handle_start(handle);
2662 uv__queue_insert_tail(&w->watchers, &handle->watchers);
2663 handle->path = w->path;
2664 handle->cb = cb;
2665 handle->wd = wd;
2666
2667 return 0;
2668 }
2669
2670
2671 int uv_fs_event_stop(uv_fs_event_t* handle) {
2672 struct watcher_list* w;
2673
2674 if (!uv__is_active(handle))
2675 return 0;
2676
2677 w = find_watcher(handle->loop, handle->wd);
2678 assert(w != NULL);
2679
2680 handle->wd = -1;
2681 handle->path = NULL;
2682 uv__handle_stop(handle);
2683 uv__queue_remove(&handle->watchers);
2684
2685 maybe_free_watcher_list(w, handle->loop);
2686
2687 return 0;
2688 }
2689
2690
2691 void uv__fs_event_close(uv_fs_event_t* handle) {
2692 uv_fs_event_stop(handle);
2693 }
2694
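/* Illustrative sketch, not part of libuv itself: a typical file watcher pairs
 * uv_fs_event_init()/uv_fs_event_start() with uv_fs_event_stop(), e.g.
 *
 *   static void on_change(uv_fs_event_t* h, const char* path, int ev, int st) {
 *     printf("%s changed (events=%d, status=%d)\n", path ? path : "?", ev, st);
 *   }
 *
 *   uv_fs_event_t watcher;
 *   uv_fs_event_init(loop, &watcher);
 *   uv_fs_event_start(&watcher, on_change, "/tmp", 0);
 *
 * and, when done watching, uv_fs_event_stop(&watcher).
 */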