1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2 * Permission is hereby granted, free of charge, to any person obtaining a copy
3 * of this software and associated documentation files (the "Software"), to
4 * deal in the Software without restriction, including without limitation the
5 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6 * sell copies of the Software, and to permit persons to whom the Software is
7 * furnished to do so, subject to the following conditions:
8 *
9 * The above copyright notice and this permission notice shall be included in
10 * all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18 * IN THE SOFTWARE.
19 */
20
21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22 * EPOLL* counterparts. We use the POLL* variants in this file because that
23 * is what libuv uses elsewhere.
24 */
25
26 #include "uv.h"
27 #include "internal.h"
28
29 #include <inttypes.h>
30 #include <stdatomic.h>
31 #include <stddef.h> /* offsetof */
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <errno.h>
38
39 #include <fcntl.h>
40 #include <ifaddrs.h>
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <netpacket/packet.h>
44 #include <sys/epoll.h>
45 #include <sys/inotify.h>
46 #include <sys/mman.h>
47 #include <sys/param.h>
48 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/stat.h>
51 #include <sys/syscall.h>
52 #include <sys/sysinfo.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/utsname.h>
56 #include <time.h>
57 #include <unistd.h>
58
59 #ifndef __NR_io_uring_setup
60 # define __NR_io_uring_setup 425
61 #endif
62
63 #ifndef __NR_io_uring_enter
64 # define __NR_io_uring_enter 426
65 #endif
66
67 #ifndef __NR_io_uring_register
68 # define __NR_io_uring_register 427
69 #endif
70
71 #ifndef __NR_copy_file_range
72 # if defined(__x86_64__)
73 # define __NR_copy_file_range 326
74 # elif defined(__i386__)
75 # define __NR_copy_file_range 377
76 # elif defined(__s390__)
77 # define __NR_copy_file_range 375
78 # elif defined(__arm__)
79 # define __NR_copy_file_range 391
80 # elif defined(__aarch64__)
81 # define __NR_copy_file_range 285
82 # elif defined(__powerpc__)
83 # define __NR_copy_file_range 379
84 # elif defined(__arc__)
85 # define __NR_copy_file_range 285
86 # elif defined(__riscv)
87 # define __NR_copy_file_range 285
88 # endif
89 #endif /* __NR_copy_file_range */
90
91 #ifndef __NR_statx
92 # if defined(__x86_64__)
93 # define __NR_statx 332
94 # elif defined(__i386__)
95 # define __NR_statx 383
96 # elif defined(__aarch64__)
97 # define __NR_statx 397
98 # elif defined(__arm__)
99 # define __NR_statx 397
100 # elif defined(__ppc__)
101 # define __NR_statx 383
102 # elif defined(__s390__)
103 # define __NR_statx 379
104 # elif defined(__riscv)
105 # define __NR_statx 291
106 # endif
107 #endif /* __NR_statx */
108
109 #ifndef __NR_getrandom
110 # if defined(__x86_64__)
111 # define __NR_getrandom 318
112 # elif defined(__i386__)
113 # define __NR_getrandom 355
114 # elif defined(__aarch64__)
115 # define __NR_getrandom 384
116 # elif defined(__arm__)
117 # define __NR_getrandom 384
118 # elif defined(__ppc__)
119 # define __NR_getrandom 359
120 # elif defined(__s390__)
121 # define __NR_getrandom 349
122 # elif defined(__riscv)
123 # define __NR_getrandom 278
124 # endif
125 #endif /* __NR_getrandom */
126
127 enum {
128 UV__IORING_SETUP_SQPOLL = 2u,
129 };
130
131 enum {
132 UV__IORING_FEAT_SINGLE_MMAP = 1u,
133 UV__IORING_FEAT_NODROP = 2u,
134 UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */
135 };
136
137 enum {
138 UV__IORING_OP_READV = 1,
139 UV__IORING_OP_WRITEV = 2,
140 UV__IORING_OP_FSYNC = 3,
141 UV__IORING_OP_OPENAT = 18,
142 UV__IORING_OP_CLOSE = 19,
143 UV__IORING_OP_STATX = 21,
144 UV__IORING_OP_EPOLL_CTL = 29,
145 UV__IORING_OP_RENAMEAT = 35,
146 UV__IORING_OP_UNLINKAT = 36,
147 UV__IORING_OP_MKDIRAT = 37,
148 UV__IORING_OP_SYMLINKAT = 38,
149 UV__IORING_OP_LINKAT = 39,
150 };
151
152 enum {
153 UV__IORING_ENTER_GETEVENTS = 1u,
154 UV__IORING_ENTER_SQ_WAKEUP = 2u,
155 };
156
157 enum {
158 UV__IORING_SQ_NEED_WAKEUP = 1u,
159 UV__IORING_SQ_CQ_OVERFLOW = 2u,
160 };
161
162 enum {
163 UV__MKDIRAT_SYMLINKAT_LINKAT = 1u,
164 };
165
166 struct uv__io_cqring_offsets {
167 uint32_t head;
168 uint32_t tail;
169 uint32_t ring_mask;
170 uint32_t ring_entries;
171 uint32_t overflow;
172 uint32_t cqes;
173 uint64_t reserved0;
174 uint64_t reserved1;
175 };
176
177 STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
178
179 struct uv__io_sqring_offsets {
180 uint32_t head;
181 uint32_t tail;
182 uint32_t ring_mask;
183 uint32_t ring_entries;
184 uint32_t flags;
185 uint32_t dropped;
186 uint32_t array;
187 uint32_t reserved0;
188 uint64_t reserved1;
189 };
190
191 STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
192
193 struct uv__io_uring_cqe {
194 uint64_t user_data;
195 int32_t res;
196 uint32_t flags;
197 };
198
199 STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
200
201 struct uv__io_uring_sqe {
202 uint8_t opcode;
203 uint8_t flags;
204 uint16_t ioprio;
205 int32_t fd;
206 union {
207 uint64_t off;
208 uint64_t addr2;
209 };
210 union {
211 uint64_t addr;
212 };
213 uint32_t len;
214 union {
215 uint32_t rw_flags;
216 uint32_t fsync_flags;
217 uint32_t open_flags;
218 uint32_t statx_flags;
219 };
220 uint64_t user_data;
221 union {
222 uint16_t buf_index;
223 uint64_t pad[3];
224 };
225 };
226
227 STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
228 STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
229 STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
230 STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
231 STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
232 STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
233 STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
234 STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
235 STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
236 STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
237 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
238
239 struct uv__io_uring_params {
240 uint32_t sq_entries;
241 uint32_t cq_entries;
242 uint32_t flags;
243 uint32_t sq_thread_cpu;
244 uint32_t sq_thread_idle;
245 uint32_t features;
246 uint32_t reserved[4];
247 struct uv__io_sqring_offsets sq_off; /* 40 bytes */
248 struct uv__io_cqring_offsets cq_off; /* 40 bytes */
249 };
250
251 STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
252 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
253 STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
254
255 STATIC_ASSERT(EPOLL_CTL_ADD < 4);
256 STATIC_ASSERT(EPOLL_CTL_DEL < 4);
257 STATIC_ASSERT(EPOLL_CTL_MOD < 4);
258
259 struct watcher_list {
260 RB_ENTRY(watcher_list) entry;
261 struct uv__queue watchers;
262 int iterating;
263 char* path;
264 int wd;
265 };
266
267 struct watcher_root {
268 struct watcher_list* rbh_root;
269 };
270
271 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
272 static void uv__inotify_read(uv_loop_t* loop,
273 uv__io_t* w,
274 unsigned int revents);
275 static int compare_watchers(const struct watcher_list* a,
276 const struct watcher_list* b);
277 static void maybe_free_watcher_list(struct watcher_list* w,
278 uv_loop_t* loop);
279
280 static void uv__epoll_ctl_flush(int epollfd,
281 struct uv__iou* ctl,
282 struct epoll_event (*events)[256]);
283
284 static void uv__epoll_ctl_prep(int epollfd,
285 struct uv__iou* ctl,
286 struct epoll_event (*events)[256],
287 int op,
288 int fd,
289 struct epoll_event* e);
290
291 RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
292
293
294 static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
295 /* This cast works because watcher_root is a struct with a pointer as its
296 * sole member. Such type punning is unsafe in the presence of strict
297 * pointer aliasing (and is just plain nasty) but that is why libuv
298 * is compiled with -fno-strict-aliasing.
299 */
300 return (struct watcher_root*) &loop->inotify_watchers;
301 }
302
303
304 unsigned uv__kernel_version(void) {
305 static _Atomic unsigned cached_version;
306 struct utsname u;
307 unsigned version;
308 unsigned major;
309 unsigned minor;
310 unsigned patch;
311 char v_sig[256];
312 char* needle;
313
314 version = atomic_load_explicit(&cached_version, memory_order_relaxed);
315 if (version != 0)
316 return version;
317
318 /* Check /proc/version_signature first as it's the way to get the mainline
319 * kernel version in Ubuntu. The format is:
320 * Ubuntu ubuntu_kernel_version mainline_kernel_version
321 * For example:
322 * Ubuntu 5.15.0-79.86-generic 5.15.111
323 */
324 if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
325 if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
326 goto calculate_version;
327
328 if (-1 == uname(&u))
329 return 0;
330
331 /* In Debian we need to check `version` instead of `release` to extract the
332 * mainline kernel version. This is an example of what it looks like:
333 * #1 SMP Debian 5.10.46-4 (2021-08-03)
334 */
335 needle = strstr(u.version, "Debian ");
336 if (needle != NULL)
337 if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
338 goto calculate_version;
339
340 if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
341 return 0;
342
343 /* Handle the case where the process runs under the UNAME26 personality:
344 *
345 * - kernels >= 3.x identify as 2.6.40+x
346 * - kernels >= 4.x identify as 2.6.60+x
347 *
348 * UNAME26 is a poorly conceived hack that doesn't let us distinguish
349 * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
350 * that 2.6.60+x means 4.x.
351 *
352 * Fun fact of the day: it's technically possible to observe the actual
353 * kernel version for a brief moment because uname() first copies out the
354 * real release string before overwriting it with the backcompat string.
355 */
356 if (major == 2 && minor == 6) {
357 if (patch >= 60) {
358 major = 4;
359 minor = patch - 60;
360 patch = 0;
361 } else if (patch >= 40) {
362 major = 3;
363 minor = patch - 40;
364 patch = 0;
365 }
366 }
367
368 calculate_version:
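  /* Pack the version as major * 65536 + minor * 256 + patch so kernel
   * versions compare as plain integers, e.g. 5.15.90 encodes to 0x050F5A.
   */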
369 version = major * 65536 + minor * 256 + patch;
370 atomic_store_explicit(&cached_version, version, memory_order_relaxed);
371
372 return version;
373 }
374
375
376 ssize_t
377 uv__fs_copy_file_range(int fd_in,
378 off_t* off_in,
379 int fd_out,
380 off_t* off_out,
381 size_t len,
382 unsigned int flags)
383 {
384 #ifdef __NR_copy_file_range
385 return syscall(__NR_copy_file_range,
386 fd_in,
387 off_in,
388 fd_out,
389 off_out,
390 len,
391 flags);
392 #else
393 return errno = ENOSYS, -1;
394 #endif
395 }
396
397
398 int uv__statx(int dirfd,
399 const char* path,
400 int flags,
401 unsigned int mask,
402 struct uv__statx* statxbuf) {
403 #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
404 return errno = ENOSYS, -1;
405 #else
406 int rc;
407
408 rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
409 if (rc >= 0)
410 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
411
412 return rc;
413 #endif
414 }
415
416
417 ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
418 #if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
419 return errno = ENOSYS, -1;
420 #else
421 ssize_t rc;
422
423 rc = syscall(__NR_getrandom, buf, buflen, flags);
424 if (rc >= 0)
425 uv__msan_unpoison(buf, buflen);
426
427 return rc;
428 #endif
429 }
430
431
432 int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
433 return syscall(__NR_io_uring_setup, entries, params);
434 }
435
436
437 int uv__io_uring_enter(int fd,
438 unsigned to_submit,
439 unsigned min_complete,
440 unsigned flags) {
441 /* io_uring_enter used to take a sigset_t but it's unused
442 * in newer kernels unless IORING_ENTER_EXT_ARG is set,
443 * in which case it takes a struct io_uring_getevents_arg.
444 */
445 return syscall(__NR_io_uring_enter,
446 fd,
447 to_submit,
448 min_complete,
449 flags,
450 NULL,
451 0L);
452 }
453
454
455 int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
456 return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
457 }
458
459
460 static int uv__use_io_uring(void) {
461 #if defined(__ANDROID_API__)
462 return 0; /* Possibly available but blocked by seccomp. */
463 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
464 /* See https://github.com/libuv/libuv/issues/4158. */
465 return 0; /* All 32-bit kernels appear buggy. */
466 #elif defined(__powerpc64__) || defined(__ppc64__)
467 /* See https://github.com/libuv/libuv/issues/4283. */
468 return 0; /* Random SIGSEGV in signal handler. */
469 #else
470 /* Ternary: unknown=0, yes=1, no=-1 */
471 static _Atomic int use_io_uring;
472 char* val;
473 int use;
474
475 use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
476
477 if (use == 0) {
478 use = uv__kernel_version() >=
479 #if defined(__hppa__)
480 /* io_uring first supported on parisc in 6.1, functional in .51 */
481 /* https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/ */
482 /* 6.1.51 */ 0x060133
483 #else
484 /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
485 /* 5.10.186 */ 0x050ABA
486 #endif
487 ? 1 : -1;
488
489 /* But users can still enable it if they so desire. */
490 val = getenv("UV_USE_IO_URING");
491 if (val != NULL)
492 use = atoi(val) ? 1 : -1;
493
494 atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
495 }
496
497 return use > 0;
498 #endif
499 }
500
501
502 static void uv__iou_init(int epollfd,
503 struct uv__iou* iou,
504 uint32_t entries,
505 uint32_t flags) {
506 struct uv__io_uring_params params;
507 struct epoll_event e;
508 size_t cqlen;
509 size_t sqlen;
510 size_t maxlen;
511 size_t sqelen;
512 uint32_t i;
513 char* sq;
514 char* sqe;
515 int ringfd;
516
517 sq = MAP_FAILED;
518 sqe = MAP_FAILED;
519
520 if (!uv__use_io_uring())
521 return;
522
523 /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
524 * Mostly academic because we check for a v5.13 kernel afterwards anyway.
525 */
526 memset(&params, 0, sizeof(params));
527 params.flags = flags;
528
529 if (flags & UV__IORING_SETUP_SQPOLL)
530 params.sq_thread_idle = 10; /* milliseconds */
531
532 /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
533 ringfd = uv__io_uring_setup(entries, &params);
534 if (ringfd == -1)
535 return;
536
537 /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
538 * actually detecting is whether IORING_OP_STATX works with SQPOLL.
539 */
540 if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
541 goto fail;
542
543 /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
544 if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
545 goto fail;
546
547 /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
548 if (!(params.features & UV__IORING_FEAT_NODROP))
549 goto fail;
550
551 sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
552 cqlen =
553 params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
554 maxlen = sqlen < cqlen ? cqlen : sqlen;
555 sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
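  /* With IORING_FEAT_SINGLE_MMAP the submission and completion rings share
   * one mapping, so a single mmap() of max(sqlen, cqlen) at offset 0
   * (IORING_OFF_SQ_RING) covers both; the SQE array is mapped separately
   * at IORING_OFF_SQES.
   */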
556
557 sq = mmap(0,
558 maxlen,
559 PROT_READ | PROT_WRITE,
560 MAP_SHARED | MAP_POPULATE,
561 ringfd,
562 0); /* IORING_OFF_SQ_RING */
563
564 sqe = mmap(0,
565 sqelen,
566 PROT_READ | PROT_WRITE,
567 MAP_SHARED | MAP_POPULATE,
568 ringfd,
569 0x10000000ull); /* IORING_OFF_SQES */
570
571 if (sq == MAP_FAILED || sqe == MAP_FAILED)
572 goto fail;
573
574 if (flags & UV__IORING_SETUP_SQPOLL) {
575 /* Only interested in completion events. To get notified when
576 * the kernel pulls items from the submission ring, add POLLOUT.
577 */
578 memset(&e, 0, sizeof(e));
579 e.events = POLLIN;
580 e.data.fd = ringfd;
581
582 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
583 goto fail;
584 }
585
586 iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
587 iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
588 iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
589 iou->sqarray = (uint32_t*) (sq + params.sq_off.array);
590 iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
591 iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
592 iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
593 iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
594 iou->sq = sq;
595 iou->cqe = sq + params.cq_off.cqes;
596 iou->sqe = sqe;
597 iou->sqlen = sqlen;
598 iou->cqlen = cqlen;
599 iou->maxlen = maxlen;
600 iou->sqelen = sqelen;
601 iou->ringfd = ringfd;
602 iou->in_flight = 0;
603 iou->flags = 0;
604
605 if (uv__kernel_version() >= /* 5.15.0 */ 0x050F00)
606 iou->flags |= UV__MKDIRAT_SYMLINKAT_LINKAT;
607
608 for (i = 0; i <= iou->sqmask; i++)
609 iou->sqarray[i] = i; /* Slot -> sqe identity mapping. */
610
611 return;
612
613 fail:
614 if (sq != MAP_FAILED)
615 munmap(sq, maxlen);
616
617 if (sqe != MAP_FAILED)
618 munmap(sqe, sqelen);
619
620 uv__close(ringfd);
621 }
622
623
624 static void uv__iou_delete(struct uv__iou* iou) {
625 if (iou->ringfd > -1) {
626 munmap(iou->sq, iou->maxlen);
627 munmap(iou->sqe, iou->sqelen);
628 uv__close(iou->ringfd);
629 iou->ringfd = -1;
630 }
631 }
632
633
634 int uv__platform_loop_init(uv_loop_t* loop) {
635 uv__loop_internal_fields_t* lfields;
636
637 lfields = uv__get_internal_fields(loop);
638 lfields->ctl.ringfd = -1;
639 lfields->iou.ringfd = -2; /* "uninitialized" */
640
641 loop->inotify_watchers = NULL;
642 loop->inotify_fd = -1;
643 loop->backend_fd = epoll_create1(O_CLOEXEC);
644
645 if (loop->backend_fd == -1)
646 return UV__ERR(errno);
647
648 uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
649
650 return 0;
651 }
652
653
654 int uv__io_fork(uv_loop_t* loop) {
655 int err;
656 struct watcher_list* root;
657
658 root = uv__inotify_watchers(loop)->rbh_root;
659
660 uv__close(loop->backend_fd);
661 loop->backend_fd = -1;
662
663 /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
664 uv__platform_loop_delete(loop);
665
666 err = uv__platform_loop_init(loop);
667 if (err)
668 return err;
669
670 return uv__inotify_fork(loop, root);
671 }
672
673
674 void uv__platform_loop_delete(uv_loop_t* loop) {
675 uv__loop_internal_fields_t* lfields;
676
677 lfields = uv__get_internal_fields(loop);
678 uv__iou_delete(&lfields->ctl);
679 uv__iou_delete(&lfields->iou);
680
681 if (loop->inotify_fd != -1) {
682 uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
683 uv__close(loop->inotify_fd);
684 loop->inotify_fd = -1;
685 }
686 }
687
688
689 struct uv__invalidate {
690 struct epoll_event (*prep)[256];
691 struct epoll_event* events;
692 int nfds;
693 };
694
695
696 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
697 uv__loop_internal_fields_t* lfields;
698 struct uv__invalidate* inv;
699 struct epoll_event dummy;
700 int i;
701
702 lfields = uv__get_internal_fields(loop);
703 inv = lfields->inv;
704
705 /* Invalidate events with same file descriptor */
706 if (inv != NULL)
707 for (i = 0; i < inv->nfds; i++)
708 if (inv->events[i].data.fd == fd)
709 inv->events[i].data.fd = -1;
710
711 /* Remove the file descriptor from the epoll.
712 * This avoids a problem where the same file description remains open
713 * in another process, causing repeated junk epoll events.
714 *
715 * Perform EPOLL_CTL_DEL immediately instead of going through
716 * io_uring's submit queue, otherwise the file descriptor may
717 * be closed by the time the kernel starts the operation.
718 *
719 * We pass in a dummy epoll_event, to work around a bug in old kernels.
720 *
721 * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
722 * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
723 */
724 memset(&dummy, 0, sizeof(dummy));
725 epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
726 }
727
728
729 int uv__io_check_fd(uv_loop_t* loop, int fd) {
730 struct epoll_event e;
731 int rc;
732
733 memset(&e, 0, sizeof(e));
734 e.events = POLLIN;
735 e.data.fd = -1;
736
737 rc = 0;
738 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
739 if (errno != EEXIST)
740 rc = UV__ERR(errno);
741
742 if (rc == 0)
743 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
744 abort();
745
746 return rc;
747 }
748
749
750 /* Caller must initialize SQE and call uv__iou_submit(). */
751 static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
752 uv_loop_t* loop,
753 uv_fs_t* req) {
754 struct uv__io_uring_sqe* sqe;
755 uint32_t head;
756 uint32_t tail;
757 uint32_t mask;
758 uint32_t slot;
759
760 /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
761 * initialization failed. Anything else is a valid ring file descriptor.
762 */
763 if (iou->ringfd == -2) {
764 uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);
765 if (iou->ringfd == -2)
766 iou->ringfd = -1; /* "failed" */
767 }
768
769 if (iou->ringfd == -1)
770 return NULL;
771
772 head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
773 memory_order_acquire);
774 tail = *iou->sqtail;
775 mask = iou->sqmask;
776
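  /* The ring is full when advancing the tail would make it collide with the
   * head; the ring size is a power of two, hence the mask arithmetic.
   */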
777 if ((head & mask) == ((tail + 1) & mask))
778 return NULL; /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
779
780 slot = tail & mask;
781 sqe = iou->sqe;
782 sqe = &sqe[slot];
783 memset(sqe, 0, sizeof(*sqe));
784 sqe->user_data = (uintptr_t) req;
785
786 /* Pacify uv_cancel(). */
787 req->work_req.loop = loop;
788 req->work_req.work = NULL;
789 req->work_req.done = NULL;
790 uv__queue_init(&req->work_req.wq);
791
792 uv__req_register(loop, req);
793 iou->in_flight++;
794
795 return sqe;
796 }
797
798
799 static void uv__iou_submit(struct uv__iou* iou) {
800 uint32_t flags;
801
802 atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
803 *iou->sqtail + 1,
804 memory_order_release);
805
806 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
807 memory_order_acquire);
808
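  /* With SQPOLL the kernel's poller thread parks itself after sq_thread_idle
   * milliseconds and sets IORING_SQ_NEED_WAKEUP; in that case we must call
   * io_uring_enter() with SQ_WAKEUP or the new SQE would go unnoticed.
   */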
809 if (flags & UV__IORING_SQ_NEED_WAKEUP)
810 if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
811 if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */
812 perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */
813 }
814
815
816 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
817 struct uv__io_uring_sqe* sqe;
818 struct uv__iou* iou;
819 int kv;
820
821 kv = uv__kernel_version();
822 /* Work around a poorly understood bug in older kernels where closing a file
823 * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
824 * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
825 * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
826 * but good candidates are the several data race fixes. Interestingly, it
827 * seems to manifest only when running under Docker so the possibility of
828 * a Docker bug can't be completely ruled out either. Yay, computers.
829 * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
830 * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
831 * solved.
832 */
833 if (kv < /* 5.15.90 */ 0x050F5A)
834 return 0;
835
836 if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
837 return 0;
838
839
840 iou = &uv__get_internal_fields(loop)->iou;
841
842 sqe = uv__iou_get_sqe(iou, loop, req);
843 if (sqe == NULL)
844 return 0;
845
846 sqe->fd = req->file;
847 sqe->opcode = UV__IORING_OP_CLOSE;
848
849 uv__iou_submit(iou);
850
851 return 1;
852 }
853
854
855 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
856 uv_fs_t* req,
857 uint32_t fsync_flags) {
858 struct uv__io_uring_sqe* sqe;
859 struct uv__iou* iou;
860
861 iou = &uv__get_internal_fields(loop)->iou;
862
863 sqe = uv__iou_get_sqe(iou, loop, req);
864 if (sqe == NULL)
865 return 0;
866
867 /* Little-known fact: setting sqe->off and sqe->len turns
868 * it into an asynchronous sync_file_range() operation.
869 */
870 sqe->fd = req->file;
871 sqe->fsync_flags = fsync_flags;
872 sqe->opcode = UV__IORING_OP_FSYNC;
873
874 uv__iou_submit(iou);
875
876 return 1;
877 }
878
879
880 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
881 struct uv__io_uring_sqe* sqe;
882 struct uv__iou* iou;
883
884 iou = &uv__get_internal_fields(loop)->iou;
885
886 if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
887 return 0;
888
889 sqe = uv__iou_get_sqe(iou, loop, req);
890 if (sqe == NULL)
891 return 0;
892
893 sqe->addr = (uintptr_t) req->path;
894 sqe->fd = AT_FDCWD;
895 sqe->addr2 = (uintptr_t) req->new_path;
896 sqe->len = AT_FDCWD;
897 sqe->opcode = UV__IORING_OP_LINKAT;
898
899 uv__iou_submit(iou);
900
901 return 1;
902 }
903
904
905 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
906 struct uv__io_uring_sqe* sqe;
907 struct uv__iou* iou;
908
909 iou = &uv__get_internal_fields(loop)->iou;
910
911 if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
912 return 0;
913
914 sqe = uv__iou_get_sqe(iou, loop, req);
915 if (sqe == NULL)
916 return 0;
917
918 sqe->addr = (uintptr_t) req->path;
919 sqe->fd = AT_FDCWD;
920 sqe->len = req->mode;
921 sqe->opcode = UV__IORING_OP_MKDIRAT;
922
923 uv__iou_submit(iou);
924
925 return 1;
926 }
927
928
929 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
930 struct uv__io_uring_sqe* sqe;
931 struct uv__iou* iou;
932
933 iou = &uv__get_internal_fields(loop)->iou;
934
935 sqe = uv__iou_get_sqe(iou, loop, req);
936 if (sqe == NULL)
937 return 0;
938
939 sqe->addr = (uintptr_t) req->path;
940 sqe->fd = AT_FDCWD;
941 sqe->len = req->mode;
942 sqe->opcode = UV__IORING_OP_OPENAT;
943 sqe->open_flags = req->flags | O_CLOEXEC;
944
945 uv__iou_submit(iou);
946
947 return 1;
948 }
949
950
951 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
952 struct uv__io_uring_sqe* sqe;
953 struct uv__iou* iou;
954
955 iou = &uv__get_internal_fields(loop)->iou;
956
957 sqe = uv__iou_get_sqe(iou, loop, req);
958 if (sqe == NULL)
959 return 0;
960
961 sqe->addr = (uintptr_t) req->path;
962 sqe->fd = AT_FDCWD;
963 sqe->addr2 = (uintptr_t) req->new_path;
964 sqe->len = AT_FDCWD;
965 sqe->opcode = UV__IORING_OP_RENAMEAT;
966
967 uv__iou_submit(iou);
968
969 return 1;
970 }
971
972
973 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
974 struct uv__io_uring_sqe* sqe;
975 struct uv__iou* iou;
976
977 iou = &uv__get_internal_fields(loop)->iou;
978
979 if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
980 return 0;
981
982 sqe = uv__iou_get_sqe(iou, loop, req);
983 if (sqe == NULL)
984 return 0;
985
986 sqe->addr = (uintptr_t) req->path;
987 sqe->fd = AT_FDCWD;
988 sqe->addr2 = (uintptr_t) req->new_path;
989 sqe->opcode = UV__IORING_OP_SYMLINKAT;
990
991 uv__iou_submit(iou);
992
993 return 1;
994 }
995
996
997 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
998 struct uv__io_uring_sqe* sqe;
999 struct uv__iou* iou;
1000
1001 iou = &uv__get_internal_fields(loop)->iou;
1002
1003 sqe = uv__iou_get_sqe(iou, loop, req);
1004 if (sqe == NULL)
1005 return 0;
1006
1007 sqe->addr = (uintptr_t) req->path;
1008 sqe->fd = AT_FDCWD;
1009 sqe->opcode = UV__IORING_OP_UNLINKAT;
1010
1011 uv__iou_submit(iou);
1012
1013 return 1;
1014 }
1015
1016
1017 int uv__iou_fs_read_or_write(uv_loop_t* loop,
1018 uv_fs_t* req,
1019 int is_read) {
1020 struct uv__io_uring_sqe* sqe;
1021 struct uv__iou* iou;
1022
1023 /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall
1024 * back to the threadpool on writes. */
1025 if (req->nbufs > IOV_MAX) {
1026 if (is_read)
1027 req->nbufs = IOV_MAX;
1028 else
1029 return 0;
1030 }
1031
1032 iou = &uv__get_internal_fields(loop)->iou;
1033
1034 sqe = uv__iou_get_sqe(iou, loop, req);
1035 if (sqe == NULL)
1036 return 0;
1037
1038 sqe->addr = (uintptr_t) req->bufs;
1039 sqe->fd = req->file;
1040 sqe->len = req->nbufs;
1041 sqe->off = req->off < 0 ? -1 : req->off;
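  /* An offset of -1 makes the kernel use (and advance) the file's current
   * position, mirroring plain readv()/writev() semantics.
   */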
1042 sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1043
1044 uv__iou_submit(iou);
1045
1046 return 1;
1047 }
1048
1049
1050 int uv__iou_fs_statx(uv_loop_t* loop,
1051 uv_fs_t* req,
1052 int is_fstat,
1053 int is_lstat) {
1054 struct uv__io_uring_sqe* sqe;
1055 struct uv__statx* statxbuf;
1056 struct uv__iou* iou;
1057
1058 statxbuf = uv__malloc(sizeof(*statxbuf));
1059 if (statxbuf == NULL)
1060 return 0;
1061
1062 iou = &uv__get_internal_fields(loop)->iou;
1063
1064 sqe = uv__iou_get_sqe(iou, loop, req);
1065 if (sqe == NULL) {
1066 uv__free(statxbuf);
1067 return 0;
1068 }
1069
1070 req->ptr = statxbuf;
1071
1072 sqe->addr = (uintptr_t) req->path;
1073 sqe->addr2 = (uintptr_t) statxbuf;
1074 sqe->fd = AT_FDCWD;
1075 sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1076 sqe->opcode = UV__IORING_OP_STATX;
1077
1078 if (is_fstat) {
1079 sqe->addr = (uintptr_t) "";
1080 sqe->fd = req->file;
1081 sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1082 }
1083
1084 if (is_lstat)
1085 sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1086
1087 uv__iou_submit(iou);
1088
1089 return 1;
1090 }
1091
1092
1093 void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
1094 buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
1095 buf->st_mode = statxbuf->stx_mode;
1096 buf->st_nlink = statxbuf->stx_nlink;
1097 buf->st_uid = statxbuf->stx_uid;
1098 buf->st_gid = statxbuf->stx_gid;
1099 buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
1100 buf->st_ino = statxbuf->stx_ino;
1101 buf->st_size = statxbuf->stx_size;
1102 buf->st_blksize = statxbuf->stx_blksize;
1103 buf->st_blocks = statxbuf->stx_blocks;
1104 buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
1105 buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
1106 buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
1107 buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
1108 buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
1109 buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
1110 buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
1111 buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
1112 buf->st_flags = 0;
1113 buf->st_gen = 0;
1114 }
1115
1116
1117 static void uv__iou_fs_statx_post(uv_fs_t* req) {
1118 struct uv__statx* statxbuf;
1119 uv_stat_t* buf;
1120
1121 buf = &req->statbuf;
1122 statxbuf = req->ptr;
1123 req->ptr = NULL;
1124
1125 if (req->result == 0) {
1126 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
1127 uv__statx_to_stat(statxbuf, buf);
1128 req->ptr = buf;
1129 }
1130
1131 uv__free(statxbuf);
1132 }
1133
1134
1135 static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
1136 struct uv__io_uring_cqe* cqe;
1137 struct uv__io_uring_cqe* e;
1138 uv_fs_t* req;
1139 uint32_t head;
1140 uint32_t tail;
1141 uint32_t mask;
1142 uint32_t i;
1143 uint32_t flags;
1144 int nevents;
1145 int rc;
1146
1147 head = *iou->cqhead;
1148 tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
1149 memory_order_acquire);
1150 mask = iou->cqmask;
1151 cqe = iou->cqe;
1152 nevents = 0;
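  /* Drain every completion between head and tail; the new head is published
   * back to the kernel with a release store once the batch is processed.
   */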
1153
1154 for (i = head; i != tail; i++) {
1155 e = &cqe[i & mask];
1156
1157 req = (uv_fs_t*) (uintptr_t) e->user_data;
1158 assert(req->type == UV_FS);
1159
1160 uv__req_unregister(loop, req);
1161 iou->in_flight--;
1162
1163 /* If the op is not supported by the kernel retry using the thread pool */
1164 if (e->res == -EOPNOTSUPP) {
1165 uv__fs_post(loop, req);
1166 continue;
1167 }
1168
1169 /* io_uring stores error codes as negative numbers, same as libuv. */
1170 req->result = e->res;
1171
1172 switch (req->fs_type) {
1173 case UV_FS_FSTAT:
1174 case UV_FS_LSTAT:
1175 case UV_FS_STAT:
1176 uv__iou_fs_statx_post(req);
1177 break;
1178 default: /* Squelch -Wswitch warnings. */
1179 break;
1180 }
1181
1182 uv__metrics_update_idle_time(loop);
1183 req->cb(req);
1184 nevents++;
1185 }
1186
1187 atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
1188 tail,
1189 memory_order_release);
1190
1191 /* Check whether CQEs overflowed; if so, enter the kernel to make them
1192 * available. Don't grab them immediately but in the next loop iteration to
1193 * avoid loop starvation. */
1194 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
1195 memory_order_acquire);
1196
1197 if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
1198 do
1199 rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
1200 while (rc == -1 && errno == EINTR);
1201
1202 if (rc < 0)
1203 perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
1204 }
1205
1206 uv__metrics_inc_events(loop, nevents);
1207 if (uv__get_internal_fields(loop)->current_timeout == 0)
1208 uv__metrics_inc_events_waiting(loop, nevents);
1209 }
1210
1211
1212 /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1213 * executed immediately, otherwise the file descriptor may have been closed
1214 * by the time the kernel starts the operation.
1215 */
1216 static void uv__epoll_ctl_prep(int epollfd,
1217 struct uv__iou* ctl,
1218 struct epoll_event (*events)[256],
1219 int op,
1220 int fd,
1221 struct epoll_event* e) {
1222 struct uv__io_uring_sqe* sqe;
1223 struct epoll_event* pe;
1224 uint32_t mask;
1225 uint32_t slot;
1226
1227 assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
1228 assert(ctl->ringfd != -1);
1229
1230 mask = ctl->sqmask;
1231 slot = (*ctl->sqtail)++ & mask;
1232
1233 pe = &(*events)[slot];
1234 *pe = *e;
1235
1236 sqe = ctl->sqe;
1237 sqe = &sqe[slot];
1238
1239 memset(sqe, 0, sizeof(*sqe));
1240 sqe->addr = (uintptr_t) pe;
1241 sqe->fd = epollfd;
1242 sqe->len = op;
1243 sqe->off = fd;
1244 sqe->opcode = UV__IORING_OP_EPOLL_CTL;
1245 sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
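  /* Pack everything needed to retry a failed submission into user_data:
   * bits 0-1 hold the epoll op, bits 2-9 the slot in the saved events
   * array, and the upper 32 bits the file descriptor. uv__epoll_ctl_flush()
   * unpacks it again when inspecting the completions.
   */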
1246
1247 if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
1248 uv__epoll_ctl_flush(epollfd, ctl, events);
1249 }
1250
1251
1252 static void uv__epoll_ctl_flush(int epollfd,
1253 struct uv__iou* ctl,
1254 struct epoll_event (*events)[256]) {
1255 struct epoll_event oldevents[256];
1256 struct uv__io_uring_cqe* cqe;
1257 uint32_t oldslot;
1258 uint32_t slot;
1259 uint32_t n;
1260 int fd;
1261 int op;
1262 int rc;
1263
1264 STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
1265 assert(ctl->ringfd != -1);
1266 assert(*ctl->sqhead != *ctl->sqtail);
1267
1268 n = *ctl->sqtail - *ctl->sqhead;
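  /* Submit all pending EPOLL_CTL submissions and wait for the same number
   * of completions in a single io_uring_enter() call.
   */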
1269 do
1270 rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
1271 while (rc == -1 && errno == EINTR);
1272
1273 if (rc < 0)
1274 perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
1275
1276 if (rc != (int) n)
1277 abort();
1278
1279 assert(*ctl->sqhead == *ctl->sqtail);
1280
1281 memcpy(oldevents, *events, sizeof(*events));
1282
1283 /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
1284 * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
1285 * that we are already watching. Ignore the former and retry the latter
1286 * with EPOLL_CTL_MOD.
1287 */
1288 while (*ctl->cqhead != *ctl->cqtail) {
1289 slot = (*ctl->cqhead)++ & ctl->cqmask;
1290
1291 cqe = ctl->cqe;
1292 cqe = &cqe[slot];
1293
1294 if (cqe->res == 0)
1295 continue;
1296
1297 fd = cqe->user_data >> 32;
1298 op = 3 & cqe->user_data;
1299 oldslot = 255 & (cqe->user_data >> 2);
1300
1301 if (op == EPOLL_CTL_DEL)
1302 continue;
1303
1304 if (op != EPOLL_CTL_ADD)
1305 abort();
1306
1307 if (cqe->res != -EEXIST)
1308 abort();
1309
1310 uv__epoll_ctl_prep(epollfd,
1311 ctl,
1312 events,
1313 EPOLL_CTL_MOD,
1314 fd,
1315 &oldevents[oldslot]);
1316 }
1317 }
1318
1319
1320 void uv__io_poll(uv_loop_t* loop, int timeout) {
1321 uv__loop_internal_fields_t* lfields;
1322 struct epoll_event events[1024];
1323 struct epoll_event prep[256];
1324 struct uv__invalidate inv;
1325 struct epoll_event* pe;
1326 struct epoll_event e;
1327 struct uv__iou* ctl;
1328 struct uv__iou* iou;
1329 int real_timeout;
1330 struct uv__queue* q;
1331 uv__io_t* w;
1332 sigset_t* sigmask;
1333 sigset_t sigset;
1334 uint64_t base;
1335 int have_iou_events;
1336 int have_signals;
1337 int nevents;
1338 int epollfd;
1339 int count;
1340 int nfds;
1341 int fd;
1342 int op;
1343 int i;
1344 int user_timeout;
1345 int reset_timeout;
1346
1347 lfields = uv__get_internal_fields(loop);
1348 ctl = &lfields->ctl;
1349 iou = &lfields->iou;
1350
1351 sigmask = NULL;
1352 if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
1353 sigemptyset(&sigset);
1354 sigaddset(&sigset, SIGPROF);
1355 sigmask = &sigset;
1356 }
1357
1358 assert(timeout >= -1);
1359 base = loop->time;
1360 count = 48; /* Benchmarks suggest this gives the best throughput. */
1361 real_timeout = timeout;
1362
1363 if (lfields->flags & UV_METRICS_IDLE_TIME) {
1364 reset_timeout = 1;
1365 user_timeout = timeout;
1366 timeout = 0;
1367 } else {
1368 reset_timeout = 0;
1369 user_timeout = 0;
1370 }
1371
1372 epollfd = loop->backend_fd;
1373
1374 memset(&e, 0, sizeof(e));
1375
1376 while (!uv__queue_empty(&loop->watcher_queue)) {
1377 q = uv__queue_head(&loop->watcher_queue);
1378 w = uv__queue_data(q, uv__io_t, watcher_queue);
1379 uv__queue_remove(q);
1380 uv__queue_init(q);
1381
1382 op = EPOLL_CTL_MOD;
1383 if (w->events == 0)
1384 op = EPOLL_CTL_ADD;
1385
1386 w->events = w->pevents;
1387 e.events = w->pevents;
1388 e.data.fd = w->fd;
1389 fd = w->fd;
1390
1391 if (ctl->ringfd != -1) {
1392 uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
1393 continue;
1394 }
1395
1396 if (!epoll_ctl(epollfd, op, fd, &e))
1397 continue;
1398
1399 assert(op == EPOLL_CTL_ADD);
1400 assert(errno == EEXIST);
1401
1402 /* File descriptor that's been watched before, update event mask. */
1403 if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
1404 abort();
1405 }
1406
1407 inv.events = events;
1408 inv.prep = &prep;
1409 inv.nfds = -1;
1410
1411 for (;;) {
1412 if (loop->nfds == 0)
1413 if (iou->in_flight == 0)
1414 break;
1415
1416 /* All event mask mutations should be visible to the kernel before
1417 * we enter epoll_pwait().
1418 */
1419 if (ctl->ringfd != -1)
1420 while (*ctl->sqhead != *ctl->sqtail)
1421 uv__epoll_ctl_flush(epollfd, ctl, &prep);
1422
1423 /* Only need to set the provider_entry_time if timeout != 0. The function
1424 * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
1425 */
1426 if (timeout != 0)
1427 uv__metrics_set_provider_entry_time(loop);
1428
1429 /* Store the current timeout in a location that's globally accessible so
1430 * other locations like uv__work_done() can determine whether the queue
1431 * of events in the callback were waiting when poll was called.
1432 */
1433 lfields->current_timeout = timeout;
1434
1435 nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
1436
1437 /* Update loop->time unconditionally. It's tempting to skip the update when
1438 * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
1439 * operating system didn't reschedule our process while in the syscall.
1440 */
1441 SAVE_ERRNO(uv__update_time(loop));
1442
1443 if (nfds == -1)
1444 assert(errno == EINTR);
1445 else if (nfds == 0)
1446 /* Unlimited timeout should only return with events or signal. */
1447 assert(timeout != -1);
1448
1449 if (nfds == 0 || nfds == -1) {
1450 if (reset_timeout != 0) {
1451 timeout = user_timeout;
1452 reset_timeout = 0;
1453 } else if (nfds == 0) {
1454 return;
1455 }
1456
1457 /* Interrupted by a signal. Update timeout and poll again. */
1458 goto update_timeout;
1459 }
1460
1461 have_iou_events = 0;
1462 have_signals = 0;
1463 nevents = 0;
1464
1465 inv.nfds = nfds;
1466 lfields->inv = &inv;
1467
1468 for (i = 0; i < nfds; i++) {
1469 pe = events + i;
1470 fd = pe->data.fd;
1471
1472 /* Skip invalidated events, see uv__platform_invalidate_fd */
1473 if (fd == -1)
1474 continue;
1475
1476 if (fd == iou->ringfd) {
1477 uv__poll_io_uring(loop, iou);
1478 have_iou_events = 1;
1479 continue;
1480 }
1481
1482 assert(fd >= 0);
1483 assert((unsigned) fd < loop->nwatchers);
1484
1485 w = loop->watchers[fd];
1486
1487 if (w == NULL) {
1488 /* File descriptor that we've stopped watching, disarm it.
1489 *
1490 * Ignore all errors because we may be racing with another thread
1491 * when the file descriptor is closed.
1492 *
1493 * Perform EPOLL_CTL_DEL immediately instead of going through
1494 * io_uring's submit queue, otherwise the file descriptor may
1495 * be closed by the time the kernel starts the operation.
1496 */
1497 epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
1498 continue;
1499 }
1500
1501 /* Give users only events they're interested in. Prevents spurious
1502 * callbacks when previous callback invocation in this loop has stopped
1503 * the current watcher. Also filters out events that the user has not
1504 * requested us to watch.
1505 */
1506 pe->events &= w->pevents | POLLERR | POLLHUP;
1507
1508 /* Work around an epoll quirk where it sometimes reports just the
1509 * EPOLLERR or EPOLLHUP event. In order to force the event loop to
1510 * move forward, we merge in the read/write events that the watcher
1511 * is interested in; uv__read() and uv__write() will then deal with
1512 * the error or hangup in the usual fashion.
1513 *
1514 * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
1515 * reads the available data, calls uv_read_stop(), then sometime later
1516 * calls uv_read_start() again. By then, libuv has forgotten about the
1517 * hangup and the kernel won't report EPOLLIN again because there's
1518 * nothing left to read. If anything, libuv is to blame here. The
1519 * current hack is just a quick bandaid; to properly fix it, libuv
1520 * needs to remember the error/hangup event. We should get that for
1521 * free when we switch over to edge-triggered I/O.
1522 */
1523 if (pe->events == POLLERR || pe->events == POLLHUP)
1524 pe->events |=
1525 w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);
1526
1527 if (pe->events != 0) {
1528 /* Run signal watchers last. This also affects child process watchers
1529 * because those are implemented in terms of signal watchers.
1530 */
1531 if (w == &loop->signal_io_watcher) {
1532 have_signals = 1;
1533 } else {
1534 uv__metrics_update_idle_time(loop);
1535 w->cb(loop, w, pe->events);
1536 }
1537
1538 nevents++;
1539 }
1540 }
1541
1542 uv__metrics_inc_events(loop, nevents);
1543 if (reset_timeout != 0) {
1544 timeout = user_timeout;
1545 reset_timeout = 0;
1546 uv__metrics_inc_events_waiting(loop, nevents);
1547 }
1548
1549 if (have_signals != 0) {
1550 uv__metrics_update_idle_time(loop);
1551 loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
1552 }
1553
1554 lfields->inv = NULL;
1555
1556 if (have_iou_events != 0)
1557 break; /* Event loop should cycle now so don't poll again. */
1558
1559 if (have_signals != 0)
1560 break; /* Event loop should cycle now so don't poll again. */
1561
1562 if (nevents != 0) {
1563 if (nfds == ARRAY_SIZE(events) && --count != 0) {
1564 /* Poll for more events but don't block this time. */
1565 timeout = 0;
1566 continue;
1567 }
1568 break;
1569 }
1570
1571 update_timeout:
1572 if (timeout == 0)
1573 break;
1574
1575 if (timeout == -1)
1576 continue;
1577
1578 assert(timeout > 0);
1579
1580 real_timeout -= (loop->time - base);
1581 if (real_timeout <= 0)
1582 break;
1583
1584 timeout = real_timeout;
1585 }
1586
1587 if (ctl->ringfd != -1)
1588 while (*ctl->sqhead != *ctl->sqtail)
1589 uv__epoll_ctl_flush(epollfd, ctl, &prep);
1590 }
1591
1592 uint64_t uv__hrtime(uv_clocktype_t type) {
1593 static _Atomic clock_t fast_clock_id = -1;
1594 struct timespec t;
1595 clock_t clock_id;
1596
1597 /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
1598 * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is
1599 * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
1600 * decide to make a costly system call.
1601 */
1602 /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
1603 * when it has microsecond granularity or better (unlikely).
1604 */
1605 clock_id = CLOCK_MONOTONIC;
1606 if (type != UV_CLOCK_FAST)
1607 goto done;
1608
1609 clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
1610 if (clock_id != -1)
1611 goto done;
1612
1613 clock_id = CLOCK_MONOTONIC;
1614 if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
1615 if (t.tv_nsec <= 1 * 1000 * 1000)
1616 clock_id = CLOCK_MONOTONIC_COARSE;
1617
1618 atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
1619
1620 done:
1621
1622 if (clock_gettime(clock_id, &t))
1623 return 0; /* Not really possible. */
1624
1625 return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
1626 }
1627
1628
1629 int uv_resident_set_memory(size_t* rss) {
1630 char buf[1024];
1631 const char* s;
1632 long val;
1633 int rc;
1634 int i;
1635
1636 /* rss: 24th element */
1637 rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1638 if (rc < 0)
1639 return rc;
1640
1641 /* find the last ')' */
1642 s = strrchr(buf, ')');
1643 if (s == NULL)
1644 goto err;
1645
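  /* The second field (comm) can itself contain spaces and parentheses, so
   * start after the last ')' and skip 22 more space-separated fields to
   * land on rss, the 24th field.
   */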
1646 for (i = 1; i <= 22; i++) {
1647 s = strchr(s + 1, ' ');
1648 if (s == NULL)
1649 goto err;
1650 }
1651
1652 errno = 0;
1653 val = strtol(s, NULL, 10);
1654 if (val < 0 || errno != 0)
1655 goto err;
1656
1657 *rss = val * getpagesize();
1658 return 0;
1659
1660 err:
1661 return UV_EINVAL;
1662 }
1663
1664 int uv_uptime(double* uptime) {
1665 struct timespec now;
1666 char buf[128];
1667
1668 /* Consult /proc/uptime when present (common case), or fall back to
1669 * clock_gettime. Why not always clock_gettime? It doesn't always return the
1670 * right result under OpenVZ and possibly other containerized environments.
1671 */
1672 if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1673 if (1 == sscanf(buf, "%lf", uptime))
1674 return 0;
1675
1676 if (clock_gettime(CLOCK_BOOTTIME, &now))
1677 return UV__ERR(errno);
1678
1679 *uptime = now.tv_sec;
1680 return 0;
1681 }
1682
1683
1684 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
1685 #if defined(__PPC__)
1686 static const char model_marker[] = "cpu\t\t: ";
1687 #elif defined(__arm__)
1688 static const char model_marker[] = "Processor\t: ";
1689 #elif defined(__aarch64__)
1690 static const char model_marker[] = "CPU part\t: ";
1691 #elif defined(__mips__)
1692 static const char model_marker[] = "cpu model\t\t: ";
1693 #elif defined(__loongarch__)
1694 static const char model_marker[] = "cpu family\t\t: ";
1695 #else
1696 static const char model_marker[] = "model name\t: ";
1697 #endif
1698 static const char parts[] =
1699 #ifdef __aarch64__
1700 "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n"
1701 "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n"
1702 "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n"
1703 "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
1704 "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n"
1705 "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n"
1706 "0xc0d\nCortex-A17\n" /* Originally A12 */
1707 "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n"
1708 "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n"
1709 "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n"
1710 "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n"
1711 "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n"
1712 "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n"
1713 "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n"
1714 "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n"
1715 "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n"
1716 "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n"
1717 "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n"
1718 #endif
1719 "";
1720 struct cpu {
1721 unsigned long long freq, user, nice, sys, idle, irq;
1722 unsigned model;
1723 };
1724 FILE* fp;
1725 char* p;
1726 int found;
1727 int n;
1728 unsigned i;
1729 unsigned cpu;
1730 unsigned maxcpu;
1731 unsigned size;
1732 unsigned long long skip;
1733 struct cpu (*cpus)[8192]; /* Kernel maximum. */
1734 struct cpu* c;
1735 struct cpu t;
1736 char (*model)[64];
1737 unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
1738 /* Assumption: even big.LITTLE systems will have only a handful
1739 * of different CPU models. Most systems will just have one.
1740 */
1741 char models[8][64];
1742 char buf[1024];
1743
1744 memset(bitmap, 0, sizeof(bitmap));
1745 memset(models, 0, sizeof(models));
1746 snprintf(*models, sizeof(*models), "unknown");
1747 maxcpu = 0;
1748
1749 cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
1750 if (cpus == NULL)
1751 return UV_ENOMEM;
1752
1753 fp = uv__open_file("/proc/stat");
1754 if (fp == NULL) {
1755 uv__free(cpus);
1756 return UV__ERR(errno);
1757 }
1758
1759 if (NULL == fgets(buf, sizeof(buf), fp))
1760 abort();
1761
1762 for (;;) {
1763 memset(&t, 0, sizeof(t));
1764
1765 n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
1766 &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
1767
1768 if (n != 7)
1769 break;
1770
1771 if (NULL == fgets(buf, sizeof(buf), fp))
1772 abort();
1773
1774 if (cpu >= ARRAY_SIZE(*cpus))
1775 continue;
1776
1777 (*cpus)[cpu] = t;
1778
1779 bitmap[cpu >> 3] |= 1 << (cpu & 7);
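    /* Mark this CPU index as seen; one bit per possible CPU. Indices may be
     * sparse when some CPUs are offline.
     */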
1780
1781 if (cpu >= maxcpu)
1782 maxcpu = cpu + 1;
1783 }
1784
1785 fclose(fp);
1786
1787 fp = uv__open_file("/proc/cpuinfo");
1788 if (fp == NULL)
1789 goto nocpuinfo;
1790
1791 for (;;) {
1792 if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
1793 break; /* Parse error. */
1794
1795 found = 0;
1796 while (!found && fgets(buf, sizeof(buf), fp))
1797 found = !strncmp(buf, model_marker, sizeof(model_marker) - 1);
1798
1799 if (!found)
1800 goto next;
1801
1802 p = buf + sizeof(model_marker) - 1;
1803 n = (int) strcspn(p, "\n");
1804
1805 /* arm64: translate CPU part code to model name. */
1806 if (*parts) {
1807 p = memmem(parts, sizeof(parts) - 1, p, n + 1);
1808 if (p == NULL)
1809 p = "unknown";
1810 else
1811 p += n + 1;
1812 n = (int) strcspn(p, "\n");
1813 }
1814
1815 found = 0;
1816 for (model = models; !found && model < ARRAY_END(models); model++)
1817 found = !strncmp(p, *model, strlen(*model));
1818
1819 if (!found)
1820 goto next;
1821
1822 if (**model == '\0')
1823 snprintf(*model, sizeof(*model), "%.*s", n, p);
1824
1825 if (cpu < maxcpu)
1826 (*cpus)[cpu].model = model - models;
1827
1828 next:
1829 while (fgets(buf, sizeof(buf), fp))
1830 if (*buf == '\n')
1831 break;
1832 }
1833
1834 fclose(fp);
1835 fp = NULL;
1836
1837 nocpuinfo:
1838
1839 n = 0;
1840 for (cpu = 0; cpu < maxcpu; cpu++) {
1841 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1842 continue;
1843
1844 n++;
1845 snprintf(buf, sizeof(buf),
1846 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
1847
1848 fp = uv__open_file(buf);
1849 if (fp == NULL)
1850 continue;
1851
1852 if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
1853 abort();
1854 fclose(fp);
1855 fp = NULL;
1856 }
1857
1858 size = n * sizeof(**ci) + sizeof(models);
1859 *ci = uv__malloc(size);
1860 *count = 0;
1861
1862 if (*ci == NULL) {
1863 uv__free(cpus);
1864 return UV_ENOMEM;
1865 }
1866
1867 *count = n;
1868 p = memcpy(*ci + n, models, sizeof(models));
1869
1870 i = 0;
1871 for (cpu = 0; cpu < maxcpu; cpu++) {
1872 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1873 continue;
1874
1875 c = *cpus + cpu;
1876
1877 (*ci)[i++] = (uv_cpu_info_t) {
1878 .model = p + c->model * sizeof(*model),
1879 .speed = c->freq / 1000,
1880 /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
1881 * therefore the multiplier is always 1000/100 = 10.
1882 */
1883 .cpu_times = (struct uv_cpu_times_s) {
1884 .user = 10 * c->user,
1885 .nice = 10 * c->nice,
1886 .sys = 10 * c->sys,
1887 .idle = 10 * c->idle,
1888 .irq = 10 * c->irq,
1889 },
1890 };
1891 }
1892
1893 uv__free(cpus);
1894
1895 return 0;
1896 }
1897
1898
1899 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
1900 if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
1901 return 1;
1902 if (ent->ifa_addr == NULL)
1903 return 1;
1904 /*
1905 * On Linux getifaddrs returns information related to the raw underlying
1906 * devices. We're not interested in this information yet.
1907 */
1908 if (ent->ifa_addr->sa_family == PF_PACKET)
1909 return exclude_type;
1910 return !exclude_type;
1911 }
1912
1913 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
1914 struct ifaddrs *addrs, *ent;
1915 uv_interface_address_t* address;
1916 int i;
1917 struct sockaddr_ll *sll;
1918
1919 *count = 0;
1920 *addresses = NULL;
1921
1922 if (getifaddrs(&addrs))
1923 return UV__ERR(errno);
1924
1925 /* Count the number of interfaces */
1926 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1927 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1928 continue;
1929
1930 (*count)++;
1931 }
1932
1933 if (*count == 0) {
1934 freeifaddrs(addrs);
1935 return 0;
1936 }
1937
1938 /* Make sure the memory is initialized to zero using calloc() */
1939 *addresses = uv__calloc(*count, sizeof(**addresses));
1940 if (!(*addresses)) {
1941 freeifaddrs(addrs);
1942 return UV_ENOMEM;
1943 }
1944
1945 address = *addresses;
1946
1947 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1948 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1949 continue;
1950
1951 address->name = uv__strdup(ent->ifa_name);
1952
1953 if (ent->ifa_addr->sa_family == AF_INET6) {
1954 address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
1955 } else {
1956 address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
1957 }
1958
1959 if (ent->ifa_netmask->sa_family == AF_INET6) {
1960 address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
1961 } else {
1962 address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
1963 }
1964
1965 address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
1966
1967 address++;
1968 }
1969
1970 /* Fill in physical addresses for each interface */
1971 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1972 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
1973 continue;
1974
1975 address = *addresses;
1976
1977 for (i = 0; i < (*count); i++) {
1978 size_t namelen = strlen(ent->ifa_name);
1979 /* Alias interfaces share the same physical address */
      if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
          (address->name[namelen] == 0 || address->name[namelen] == ':')) {
        sll = (struct sockaddr_ll*)ent->ifa_addr;
        memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
      }
      address++;
    }
  }

  freeifaddrs(addrs);

  return 0;
}


void uv_free_interface_addresses(uv_interface_address_t* addresses,
                                 int count) {
  int i;

  for (i = 0; i < count; i++) {
    uv__free(addresses[i].name);
  }

  uv__free(addresses);
}


void uv__set_process_title(const char* title) {
#if defined(PR_SET_NAME)
  prctl(PR_SET_NAME, title); /* Only copies first 16 characters. */
#endif
}


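/* Read /proc/meminfo and return the value of the field named by |what|
 * (e.g. "MemTotal:"), converted from kilobytes to bytes. Returns 0 when the
 * file cannot be read or the field is missing.
 */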
static uint64_t uv__read_proc_meminfo(const char* what) {
  uint64_t rc;
  char* p;
  char buf[4096]; /* Large enough to hold all of /proc/meminfo. */

  if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
    return 0;

  p = strstr(buf, what);

  if (p == NULL)
    return 0;

  p += strlen(what);

  rc = 0;
  sscanf(p, "%" PRIu64 " kB", &rc);

  return rc * 1024;
}


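/* Prefer MemAvailable from /proc/meminfo, which accounts for reclaimable
 * page cache; fall back to sysinfo()'s freeram when that field is missing.
 */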
uint64_t uv_get_free_memory(void) {
  struct sysinfo info;
  uint64_t rc;

  rc = uv__read_proc_meminfo("MemAvailable:");

  if (rc != 0)
    return rc;

  if (0 == sysinfo(&info))
    return (uint64_t) info.freeram * info.mem_unit;

  return 0;
}


uint64_t uv_get_total_memory(void) {
  struct sysinfo info;
  uint64_t rc;

  rc = uv__read_proc_meminfo("MemTotal:");

  if (rc != 0)
    return rc;

  if (0 == sysinfo(&info))
    return (uint64_t) info.totalram * info.mem_unit;

  return 0;
}


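/* Read a single unsigned integer from a sysfs or cgroup file. The literal
 * "max" that cgroup v2 uses for an unlimited value is mapped to UINT64_MAX.
 * Returns 0 when the file cannot be read or parsed.
 */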
static uint64_t uv__read_uint64(const char* filename) {
  char buf[32]; /* Large enough to hold an encoded uint64_t. */
  uint64_t rc;

  rc = 0;
  if (0 == uv__slurp(filename, buf, sizeof(buf)))
    if (1 != sscanf(buf, "%" PRIu64, &rc))
      if (0 == strcmp(buf, "max\n"))
        rc = UINT64_MAX;

  return rc;
}


/* Given a buffer with the contents of /proc/self/cgroup on a cgroup v1
 * system, finds the location and length of the memory controller mount path.
 * This disregards the leading / for easy concatenation of paths.
 * Returns NULL if the memory controller wasn't found. */
static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
                                                int* n) {
  char* p;

  /* Seek to the memory controller line. */
  p = strchr(buf, ':');
  while (p != NULL && strncmp(p, ":memory:", 8)) {
    p = strchr(p, '\n');
    if (p != NULL)
      p = strchr(p, ':');
  }

  if (p != NULL) {
    /* Determine the length of the mount path. */
    p = p + strlen(":memory:/");
    *n = (int) strcspn(p, "\n");
  }

  return p;
}

static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;
  uint64_t cgroup1_max;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
    *high = uv__read_uint64(filename);

    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
    *max = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (*high != 0 && *max != 0)
      goto update_limits;
  }

  /* Fall back to the limits of the global memory controller. */
  *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
  *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");

  /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
   * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
   */
update_limits:
  cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
  if (*high == cgroup1_max)
    *high = UINT64_MAX;
  if (*max == cgroup1_max)
    *max = UINT64_MAX;
}

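/* On cgroup v2, /proc/self/cgroup contains a single "0::/<path>" line; the
 * limits for that cgroup live in /sys/fs/cgroup/<path>/memory.max and
 * memory.high.
 */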
static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = buf + strlen("0::/");
  n = (int) strcspn(p, "\n");

  /* Read the memory limits of the controller. */
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
  *max = uv__read_uint64(filename);
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
  *high = uv__read_uint64(filename);
}

static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
  uint64_t high;
  uint64_t max;

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    uv__get_cgroup1_memory_limits(buf, &high, &max);
  else
    uv__get_cgroup2_memory_limits(buf, &high, &max);

  if (high == 0 || max == 0)
    return 0;

  return high < max ? high : max;
}

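/* Return the effective cgroup memory limit for this process: the smaller of
 * the soft (high) and hard (max) limits, or 0 when no limit applies or it
 * cannot be determined.
 */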
uint64_t uv_get_constrained_memory(void) {
  char buf[1024];

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  return uv__get_cgroup_constrained_memory(buf);
}


static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
  char filename[4097];
  uint64_t current;
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
    current = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (current != 0)
      return current;
  }

  /* Fall back to the usage of the global memory controller. */
  return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
}

static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
  char filename[4097];
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = buf + strlen("0::/");
  n = (int) strcspn(p, "\n");

  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.current", n, p);
  return uv__read_uint64(filename);
}

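/* Amount of memory this process can still use before hitting its cgroup
 * limit: the constrained limit minus current cgroup usage. Falls back to
 * uv_get_free_memory() when the process is not meaningfully constrained.
 */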
uint64_t uv_get_available_memory(void) {
  char buf[1024];
  uint64_t constrained;
  uint64_t current;
  uint64_t total;

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  constrained = uv__get_cgroup_constrained_memory(buf);
  if (constrained == 0)
    return uv_get_free_memory();

  total = uv_get_total_memory();
  if (constrained > total)
    return uv_get_free_memory();

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    current = uv__get_cgroup1_current_memory(buf);
  else
    current = uv__get_cgroup2_current_memory(buf);

  /* memory usage can be higher than the limit (for short bursts of time) */
  if (constrained < current)
    return 0;

  return constrained - current;
}


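/* On cgroup v2, cpu.max holds "<quota> <period>" where the quota is the
 * literal "max" when the cgroup is unthrottled, and cpu.weight defaults to
 * 100, so dividing by 100 yields a proportion of 1.0 for an unweighted
 * cgroup.
 */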
static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
                                            uv__cpu_constraint* constraint) {
  char path[256];
  char buf[1024];
  unsigned int weight;
  int cgroup_size;
  const char* cgroup_trimmed;
  char quota_buf[16];

  if (strncmp(cgroup, "0::/", 4) != 0)
    return UV_EINVAL;

  /* Isolate the cgroup path: skip the "0::/" prefix and stop at the
   * trailing newline. */
  cgroup_trimmed = cgroup + sizeof("0::/") - 1;
  cgroup_size = (int) strcspn(cgroup_trimmed, "\n");

  /* Construct the path to the cpu.max file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.max", cgroup_size,
           cgroup_trimmed);

  /* Read cpu.max */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%15s %llu", quota_buf, &constraint->period_length) != 2)
    return UV_EINVAL;

  if (strncmp(quota_buf, "max", 3) == 0)
    constraint->quota_per_period = LLONG_MAX;
  else if (sscanf(quota_buf, "%lld", &constraint->quota_per_period) != 1)
    return UV_EINVAL; /* Conversion failed. */

  /* Construct the path to the cpu.weight file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.weight", cgroup_size,
           cgroup_trimmed);

  /* Read cpu.weight */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%u", &weight) != 1)
    return UV_EINVAL;

  constraint->proportions = (double)weight / 100.0;

  return 0;
}

static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
                                             int* cgroup_size) {
  /* Seek to the cpu controller line. */
  char* cgroup_cpu = strstr(cgroup, ":cpu,");

  if (cgroup_cpu != NULL) {
    /* Skip the controller prefix to the start of the cgroup path. */
    cgroup_cpu += sizeof(":cpu,") - 1;
    /* Determine the length of the cgroup path, excluding the newline. */
    *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
  }

  return cgroup_cpu;
}

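/* On cgroup v1, the quota comes from cpu.cfs_quota_us (-1 when unlimited),
 * the period from cpu.cfs_period_us, and the relative share from cpu.shares,
 * whose default of 1024 maps to a proportion of 1.0.
 */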
static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
                                            uv__cpu_constraint* constraint) {
  char path[256];
  char buf[1024];
  unsigned int shares;
  int cgroup_size;
  char* cgroup_cpu;

  cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);

  if (cgroup_cpu == NULL)
    return UV_EIO;

  /* Construct the path to the cpu.cfs_quota_us file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
           cgroup_size, cgroup_cpu);

  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%lld", &constraint->quota_per_period) != 1)
    return UV_EINVAL;

  /* Construct the path to the cpu.cfs_period_us file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
           cgroup_size, cgroup_cpu);

  /* Read cpu.cfs_period_us */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%lld", &constraint->period_length) != 1)
    return UV_EINVAL;

  /* Construct the path to the cpu.shares file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.shares", cgroup_size,
           cgroup_cpu);

  /* Read cpu.shares */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%u", &shares) != 1)
    return UV_EINVAL;

  constraint->proportions = (double)shares / 1024.0;

  return 0;
}

int uv__get_constrained_cpu(uv__cpu_constraint* constraint) {
  char cgroup[1024];

  /* Read the cgroup from /proc/self/cgroup */
  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
    return UV_EIO;

  /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
   * The entry for cgroup v2 is always in the format "0::$PATH"
   * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
  if (strncmp(cgroup, "0::/", 4) == 0)
    return uv__get_cgroupv2_constrained_cpu(cgroup, constraint);
  else
    return uv__get_cgroupv1_constrained_cpu(cgroup, constraint);
}


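/* Prefer /proc/loadavg; fall back to sysinfo(), whose load averages are
 * fixed-point values scaled by 2^16 (1 << SI_LOAD_SHIFT).
 */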
void uv_loadavg(double avg[3]) {
  struct sysinfo info;
  char buf[128]; /* Large enough to hold all of /proc/loadavg. */

  if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
    if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
      return;

  if (sysinfo(&info) < 0)
    return;

  avg[0] = (double) info.loads[0] / 65536.0;
  avg[1] = (double) info.loads[1] / 65536.0;
  avg[2] = (double) info.loads[2] / 65536.0;
}


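/* Each active inotify watch descriptor has a watcher_list node, kept in a
 * per-loop red-black tree ordered by wd; the node's queue holds every
 * uv_fs_event_t handle watching that path.
 */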
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b) {
  if (a->wd < b->wd) return -1;
  if (a->wd > b->wd) return 1;
  return 0;
}


static int init_inotify(uv_loop_t* loop) {
  int fd;

  if (loop->inotify_fd != -1)
    return 0;

  fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
  if (fd < 0)
    return UV__ERR(errno);

  loop->inotify_fd = fd;
  uv__io_init(&loop->inotify_read_watcher, uv__inotify_read, loop->inotify_fd);
  uv__io_start(loop, &loop->inotify_read_watcher, POLLIN);

  return 0;
}


static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
  /* Open the inotify_fd, and re-arm all the inotify watchers. */
  int err;
  struct watcher_list* tmp_watcher_list_iter;
  struct watcher_list* watcher_list;
  struct watcher_list tmp_watcher_list;
  struct uv__queue queue;
  struct uv__queue* q;
  uv_fs_event_t* handle;
  char* tmp_path;

  if (root == NULL)
    return 0;

  /* We must restore the old watcher list to be able to close items
   * out of it.
   */
  loop->inotify_watchers = root;

  uv__queue_init(&tmp_watcher_list.watchers);
  /* Note that the queue we use is shared with the start and stop()
   * functions, making uv__queue_foreach unsafe to use. So we use the
   * uv__queue_move trick to safely iterate. Also don't free the watcher
   * list until we're done iterating. c.f. uv__inotify_read.
   */
  RB_FOREACH_SAFE(watcher_list, watcher_root,
                  uv__inotify_watchers(loop), tmp_watcher_list_iter) {
    watcher_list->iterating = 1;
    uv__queue_move(&watcher_list->watchers, &queue);
    while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      /* It's critical to keep a copy of path here, because it
       * will be set to NULL by stop() and then deallocated by
       * maybe_free_watcher_list
       */
      tmp_path = uv__strdup(handle->path);
      assert(tmp_path != NULL);
      uv__queue_remove(q);
      uv__queue_insert_tail(&watcher_list->watchers, q);
      uv_fs_event_stop(handle);

      uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
      handle->path = tmp_path;
    }
    watcher_list->iterating = 0;
    maybe_free_watcher_list(watcher_list, loop);
  }

  uv__queue_move(&tmp_watcher_list.watchers, &queue);
  while (!uv__queue_empty(&queue)) {
    q = uv__queue_head(&queue);
    uv__queue_remove(q);
    handle = uv__queue_data(q, uv_fs_event_t, watchers);
    tmp_path = handle->path;
    handle->path = NULL;
    err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
    uv__free(tmp_path);
    if (err)
      return err;
  }

  return 0;
}


static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
  struct watcher_list w;
  w.wd = wd;
  return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
}


static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
  /* if the watcher_list->watchers is being iterated over, we can't free it. */
  if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
    /* No watchers left for this path. Clean up. */
    RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
    inotify_rm_watch(loop->inotify_fd, w->wd);
    uv__free(w);
  }
}


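/* Called when the loop's inotify fd is readable: drain all pending
 * inotify_event records and invoke the callback of every handle registered
 * for the matching watch descriptor.
 */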
static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* dummy,
                             unsigned int events) {
  const struct inotify_event* e;
  struct watcher_list* w;
  uv_fs_event_t* h;
  struct uv__queue queue;
  struct uv__queue* q;
  const char* path;
  ssize_t size;
  const char *p;
  /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
  char buf[4096];

  for (;;) {
    do
      size = read(loop->inotify_fd, buf, sizeof(buf));
    while (size == -1 && errno == EINTR);

    if (size == -1) {
      assert(errno == EAGAIN || errno == EWOULDBLOCK);
      break;
    }

    assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */

    /* Now we have one or more inotify_event structs. */
    for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
      e = (const struct inotify_event*) p;

      events = 0;
      if (e->mask & (IN_ATTRIB|IN_MODIFY))
        events |= UV_CHANGE;
      if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
        events |= UV_RENAME;

      w = find_watcher(loop, e->wd);
      if (w == NULL)
        continue; /* Stale event, no watchers left. */

      /* inotify does not return the filename when monitoring a single file
       * for modifications. Repurpose the filename for API compatibility.
       * I'm not convinced this is a good thing, maybe it should go.
       */
      path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);

      /* We're about to iterate over the queue and call user's callbacks.
       * What can go wrong?
       * A callback could call uv_fs_event_stop()
       * and the queue can change under our feet.
       * So, we use uv__queue_move() trick to safely iterate over the queue.
       * And we don't free the watcher_list until we're done iterating.
       *
       * First,
       * tell uv_fs_event_stop() (that could be called from a user's callback)
       * not to free watcher_list.
       */
      w->iterating = 1;
      uv__queue_move(&w->watchers, &queue);
      while (!uv__queue_empty(&queue)) {
        q = uv__queue_head(&queue);
        h = uv__queue_data(q, uv_fs_event_t, watchers);

        uv__queue_remove(q);
        uv__queue_insert_tail(&w->watchers, q);

        h->cb(h, path, events, 0);
      }
      /* done iterating, time to (maybe) free empty watcher_list */
      w->iterating = 0;
      maybe_free_watcher_list(w, loop);
    }
  }
}


int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
  return 0;
}


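/* Start watching |path|: lazily create the loop's inotify fd, add (or reuse)
 * a watch descriptor for the path, and link the handle into that
 * descriptor's watcher list.
 */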
int uv_fs_event_start(uv_fs_event_t* handle,
                      uv_fs_event_cb cb,
                      const char* path,
                      unsigned int flags) {
  struct watcher_list* w;
  uv_loop_t* loop;
  size_t len;
  int events;
  int err;
  int wd;

  if (uv__is_active(handle))
    return UV_EINVAL;

  loop = handle->loop;

  err = init_inotify(loop);
  if (err)
    return err;

  events = IN_ATTRIB
         | IN_CREATE
         | IN_MODIFY
         | IN_DELETE
         | IN_DELETE_SELF
         | IN_MOVE_SELF
         | IN_MOVED_FROM
         | IN_MOVED_TO;

  wd = inotify_add_watch(loop->inotify_fd, path, events);
  if (wd == -1)
    return UV__ERR(errno);

  w = find_watcher(loop, wd);
  if (w)
    goto no_insert;

  len = strlen(path) + 1;
  w = uv__malloc(sizeof(*w) + len);
  if (w == NULL)
    return UV_ENOMEM;

  w->wd = wd;
  w->path = memcpy(w + 1, path, len);
  uv__queue_init(&w->watchers);
  w->iterating = 0;
  RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);

no_insert:
  uv__handle_start(handle);
  uv__queue_insert_tail(&w->watchers, &handle->watchers);
  handle->path = w->path;
  handle->cb = cb;
  handle->wd = wd;

  return 0;
}


int uv_fs_event_stop(uv_fs_event_t* handle) {
  struct watcher_list* w;

  if (!uv__is_active(handle))
    return 0;

  w = find_watcher(handle->loop, handle->wd);
  assert(w != NULL);

  handle->wd = -1;
  handle->path = NULL;
  uv__handle_stop(handle);
  uv__queue_remove(&handle->watchers);

  maybe_free_watcher_list(w, handle->loop);

  return 0;
}


void uv__fs_event_close(uv_fs_event_t* handle) {
  uv_fs_event_stop(handle);
}
