1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2 * Permission is hereby granted, free of charge, to any person obtaining a copy
3 * of this software and associated documentation files (the "Software"), to
4 * deal in the Software without restriction, including without limitation the
5 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6 * sell copies of the Software, and to permit persons to whom the Software is
7 * furnished to do so, subject to the following conditions:
8 *
9 * The above copyright notice and this permission notice shall be included in
10 * all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18 * IN THE SOFTWARE.
19 */
20
21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22 * EPOLL* counterparts. We use the POLL* variants in this file because that
23 * is what libuv uses elsewhere.
24 */
25
26 #include "uv.h"
27 #include "internal.h"
28
29 #include <inttypes.h>
30 #include <stdatomic.h>
31 #include <stddef.h> /* offsetof */
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <errno.h>
38
39 #include <fcntl.h>
40 #include <ifaddrs.h>
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <netpacket/packet.h>
44 #include <sys/epoll.h>
45 #include <sys/inotify.h>
46 #include <sys/mman.h>
47 #include <sys/param.h>
48 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/stat.h>
51 #include <sys/syscall.h>
52 #include <sys/sysinfo.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/utsname.h>
56 #include <time.h>
57 #include <unistd.h>
58
59 #ifndef __NR_io_uring_setup
60 # define __NR_io_uring_setup 425
61 #endif
62
63 #ifndef __NR_io_uring_enter
64 # define __NR_io_uring_enter 426
65 #endif
66
67 #ifndef __NR_io_uring_register
68 # define __NR_io_uring_register 427
69 #endif
70
71 #ifndef __NR_copy_file_range
72 # if defined(__x86_64__)
73 # define __NR_copy_file_range 326
74 # elif defined(__i386__)
75 # define __NR_copy_file_range 377
76 # elif defined(__s390__)
77 # define __NR_copy_file_range 375
78 # elif defined(__arm__)
79 # define __NR_copy_file_range 391
80 # elif defined(__aarch64__)
81 # define __NR_copy_file_range 285
82 # elif defined(__powerpc__)
83 # define __NR_copy_file_range 379
84 # elif defined(__arc__)
85 # define __NR_copy_file_range 285
86 # elif defined(__riscv)
87 # define __NR_copy_file_range 285
88 # endif
89 #endif /* __NR_copy_file_range */
90
91 #ifndef __NR_statx
92 # if defined(__x86_64__)
93 # define __NR_statx 332
94 # elif defined(__i386__)
95 # define __NR_statx 383
96 # elif defined(__aarch64__)
97 # define __NR_statx 397
98 # elif defined(__arm__)
99 # define __NR_statx 397
100 # elif defined(__ppc__)
101 # define __NR_statx 383
102 # elif defined(__s390__)
103 # define __NR_statx 379
104 # elif defined(__riscv)
105 # define __NR_statx 291
106 # endif
107 #endif /* __NR_statx */
108
109 #ifndef __NR_getrandom
110 # if defined(__x86_64__)
111 # define __NR_getrandom 318
112 # elif defined(__i386__)
113 # define __NR_getrandom 355
114 # elif defined(__aarch64__)
115 # define __NR_getrandom 384
116 # elif defined(__arm__)
117 # define __NR_getrandom 384
118 # elif defined(__ppc__)
119 # define __NR_getrandom 359
120 # elif defined(__s390__)
121 # define __NR_getrandom 349
122 # elif defined(__riscv)
123 # define __NR_getrandom 278
124 # endif
125 #endif /* __NR_getrandom */
126
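/* The enums and structs below mirror the kernel's io_uring ABI
 * (include/uapi/linux/io_uring.h). libuv carries its own UV__-prefixed
 * copies so it can drive io_uring without depending on liburing or on new
 * kernel headers at build time; only the constants and fields libuv
 * actually uses are reproduced here.
 */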
127 enum {
128 UV__IORING_SETUP_SQPOLL = 2u,
129 UV__IORING_SETUP_NO_SQARRAY = 0x10000u,
130 };
131
132 enum {
133 UV__IORING_FEAT_SINGLE_MMAP = 1u,
134 UV__IORING_FEAT_NODROP = 2u,
135 UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */
136 };
137
138 enum {
139 UV__IORING_OP_READV = 1,
140 UV__IORING_OP_WRITEV = 2,
141 UV__IORING_OP_FSYNC = 3,
142 UV__IORING_OP_OPENAT = 18,
143 UV__IORING_OP_CLOSE = 19,
144 UV__IORING_OP_STATX = 21,
145 UV__IORING_OP_EPOLL_CTL = 29,
146 UV__IORING_OP_RENAMEAT = 35,
147 UV__IORING_OP_UNLINKAT = 36,
148 UV__IORING_OP_MKDIRAT = 37,
149 UV__IORING_OP_SYMLINKAT = 38,
150 UV__IORING_OP_LINKAT = 39,
151 UV__IORING_OP_FTRUNCATE = 55,
152 };
153
154 enum {
155 UV__IORING_ENTER_GETEVENTS = 1u,
156 UV__IORING_ENTER_SQ_WAKEUP = 2u,
157 };
158
159 enum {
160 UV__IORING_SQ_NEED_WAKEUP = 1u,
161 UV__IORING_SQ_CQ_OVERFLOW = 2u,
162 };
163
164 struct uv__io_cqring_offsets {
165 uint32_t head;
166 uint32_t tail;
167 uint32_t ring_mask;
168 uint32_t ring_entries;
169 uint32_t overflow;
170 uint32_t cqes;
171 uint64_t reserved0;
172 uint64_t reserved1;
173 };
174
175 STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
176
177 struct uv__io_sqring_offsets {
178 uint32_t head;
179 uint32_t tail;
180 uint32_t ring_mask;
181 uint32_t ring_entries;
182 uint32_t flags;
183 uint32_t dropped;
184 uint32_t array;
185 uint32_t reserved0;
186 uint64_t reserved1;
187 };
188
189 STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
190
191 struct uv__io_uring_cqe {
192 uint64_t user_data;
193 int32_t res;
194 uint32_t flags;
195 };
196
197 STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
198
199 struct uv__io_uring_sqe {
200 uint8_t opcode;
201 uint8_t flags;
202 uint16_t ioprio;
203 int32_t fd;
204 union {
205 uint64_t off;
206 uint64_t addr2;
207 };
208 union {
209 uint64_t addr;
210 };
211 uint32_t len;
212 union {
213 uint32_t rw_flags;
214 uint32_t fsync_flags;
215 uint32_t open_flags;
216 uint32_t statx_flags;
217 };
218 uint64_t user_data;
219 union {
220 uint16_t buf_index;
221 uint64_t pad[3];
222 };
223 };
224
225 STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
226 STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
227 STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
228 STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
229 STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
230 STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
231 STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
232 STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
233 STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
234 STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
235 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
236
237 struct uv__io_uring_params {
238 uint32_t sq_entries;
239 uint32_t cq_entries;
240 uint32_t flags;
241 uint32_t sq_thread_cpu;
242 uint32_t sq_thread_idle;
243 uint32_t features;
244 uint32_t reserved[4];
245 struct uv__io_sqring_offsets sq_off; /* 40 bytes */
246 struct uv__io_cqring_offsets cq_off; /* 40 bytes */
247 };
248
249 STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
250 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
251 STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
252
253 STATIC_ASSERT(EPOLL_CTL_ADD < 4);
254 STATIC_ASSERT(EPOLL_CTL_DEL < 4);
255 STATIC_ASSERT(EPOLL_CTL_MOD < 4);
256
257 struct watcher_list {
258 RB_ENTRY(watcher_list) entry;
259 struct uv__queue watchers;
260 int iterating;
261 char* path;
262 int wd;
263 };
264
265 struct watcher_root {
266 struct watcher_list* rbh_root;
267 };
268
269 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
270 static void uv__inotify_read(uv_loop_t* loop,
271 uv__io_t* w,
272 unsigned int revents);
273 static int compare_watchers(const struct watcher_list* a,
274 const struct watcher_list* b);
275 static void maybe_free_watcher_list(struct watcher_list* w,
276 uv_loop_t* loop);
277
278 static void uv__epoll_ctl_flush(int epollfd,
279 struct uv__iou* ctl,
280 struct epoll_event (*events)[256]);
281
282 static void uv__epoll_ctl_prep(int epollfd,
283 struct uv__iou* ctl,
284 struct epoll_event (*events)[256],
285 int op,
286 int fd,
287 struct epoll_event* e);
288
289 RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
290
291
292 static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
293 /* This cast works because watcher_root is a struct with a pointer as its
294 * sole member. Such type punning is unsafe in the presence of strict
295 * pointer aliasing (and is just plain nasty) but that is why libuv
296 * is compiled with -fno-strict-aliasing.
297 */
298 return (struct watcher_root*) &loop->inotify_watchers;
299 }
300
301
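/* Returns the kernel version packed as major * 65536 + minor * 256 + patch,
 * or 0 when the version cannot be determined. For example, kernel 5.15.111
 * encodes as 0x050F6F. The result is cached after the first successful call.
 */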
302 unsigned uv__kernel_version(void) {
303 static _Atomic unsigned cached_version;
304 struct utsname u;
305 unsigned version;
306 unsigned major;
307 unsigned minor;
308 unsigned patch;
309 char v_sig[256];
310 char* needle;
311
312 version = atomic_load_explicit(&cached_version, memory_order_relaxed);
313 if (version != 0)
314 return version;
315
316 /* Check /proc/version_signature first as it's the way to get the mainline
317 * kernel version in Ubuntu. The format is:
318 * Ubuntu ubuntu_kernel_version mainline_kernel_version
319 * For example:
320 * Ubuntu 5.15.0-79.86-generic 5.15.111
321 */
322 if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
323 if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
324 goto calculate_version;
325
326 if (-1 == uname(&u))
327 return 0;
328
329 /* In Debian we need to check `version` instead of `release` to extract the
330 * mainline kernel version. This is an example of what it looks like:
331 * #1 SMP Debian 5.10.46-4 (2021-08-03)
332 */
333 needle = strstr(u.version, "Debian ");
334 if (needle != NULL)
335 if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
336 goto calculate_version;
337
338 if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
339 return 0;
340
341 /* Handle the case where the process runs under the UNAME26 personality:
342 *
343 * - kernels >= 3.x identify as 2.6.40+x
344 * - kernels >= 4.x identify as 2.6.60+x
345 *
346 * UNAME26 is a poorly conceived hack that doesn't let us distinguish
347 * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
348 * that 2.6.60+x means 4.x.
349 *
350 * Fun fact of the day: it's technically possible to observe the actual
351 * kernel version for a brief moment because uname() first copies out the
352 * real release string before overwriting it with the backcompat string.
353 */
354 if (major == 2 && minor == 6) {
355 if (patch >= 60) {
356 major = 4;
357 minor = patch - 60;
358 patch = 0;
359 } else if (patch >= 40) {
360 major = 3;
361 minor = patch - 40;
362 patch = 0;
363 }
364 }
365
366 calculate_version:
367 version = major * 65536 + minor * 256 + patch;
368 atomic_store_explicit(&cached_version, version, memory_order_relaxed);
369
370 return version;
371 }
372
373
374 ssize_t
375 uv__fs_copy_file_range(int fd_in,
376 off_t* off_in,
377 int fd_out,
378 off_t* off_out,
379 size_t len,
380 unsigned int flags)
381 {
382 #ifdef __NR_copy_file_range
383 return syscall(__NR_copy_file_range,
384 fd_in,
385 off_in,
386 fd_out,
387 off_out,
388 len,
389 flags);
390 #else
391 return errno = ENOSYS, -1;
392 #endif
393 }
394
395
396 int uv__statx(int dirfd,
397 const char* path,
398 int flags,
399 unsigned int mask,
400 struct uv__statx* statxbuf) {
401 #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
402 return errno = ENOSYS, -1;
403 #else
404 int rc;
405
406 rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
407 if (rc >= 0)
408 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
409
410 return rc;
411 #endif
412 }
413
414
415 ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
416 #if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
417 return errno = ENOSYS, -1;
418 #else
419 ssize_t rc;
420
421 rc = syscall(__NR_getrandom, buf, buflen, flags);
422 if (rc >= 0)
423 uv__msan_unpoison(buf, buflen);
424
425 return rc;
426 #endif
427 }
428
429
430 int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
431 return syscall(__NR_io_uring_setup, entries, params);
432 }
433
434
435 int uv__io_uring_enter(int fd,
436 unsigned to_submit,
437 unsigned min_complete,
438 unsigned flags) {
439 /* io_uring_enter used to take a sigset_t but it's unused
440 * in newer kernels unless IORING_ENTER_EXT_ARG is set,
441 * in which case it takes a struct io_uring_getevents_arg.
442 */
443 return syscall(__NR_io_uring_enter,
444 fd,
445 to_submit,
446 min_complete,
447 flags,
448 NULL,
449 0L);
450 }
451
452
453 int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
454 return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
455 }
456
457
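/* Decides once per process whether io_uring may be used at all. Platforms
 * with known-bad behavior are hard-disabled by the #if branches at the top
 * of the function; everywhere else the kernel version gates the default.
 * The UV_USE_IO_URING environment variable overrides the default either
 * way: any nonzero integer enables io_uring, zero (or a non-numeric value)
 * disables it.
 */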
458 static int uv__use_io_uring(void) {
459 #if defined(__ANDROID_API__)
460 return 0; /* Possibly available but blocked by seccomp. */
461 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
462 /* See https://github.com/libuv/libuv/issues/4158. */
463 return 0; /* All 32-bit kernels appear buggy. */
464 #elif defined(__powerpc64__) || defined(__ppc64__)
465 /* See https://github.com/libuv/libuv/issues/4283. */
466 return 0; /* Random SIGSEGV in signal handler. */
467 #else
468 /* Ternary: unknown=0, yes=1, no=-1 */
469 static _Atomic int use_io_uring;
470 char* val;
471 int use;
472
473 use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
474
475 if (use == 0) {
476 use = uv__kernel_version() >=
477 #if defined(__hppa__)
478 /* io_uring first supported on parisc in 6.1, functional in .51 */
479 /* https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/ */
480 /* 6.1.51 */ 0x060133
481 #else
482 /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
483 /* 5.10.186 */ 0x050ABA
484 #endif
485 ? 1 : -1;
486
487 /* But users can still enable it if they so desire. */
488 val = getenv("UV_USE_IO_URING");
489 if (val != NULL)
490 use = atoi(val) ? 1 : -1;
491
492 atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
493 }
494
495 return use > 0;
496 #endif
497 }
498
499
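/* Creates an io_uring instance and maps its rings into the process. The
 * submission and completion rings share a single mapping
 * (IORING_FEAT_SINGLE_MMAP); the SQE array is a separate mapping at
 * IORING_OFF_SQES. If setup fails part-way, any mappings are unmapped and
 * the ring fd is closed; the loop then silently falls back to not using
 * io_uring.
 */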
500 static void uv__iou_init(int epollfd,
501 struct uv__iou* iou,
502 uint32_t entries,
503 uint32_t flags) {
504 struct uv__io_uring_params params;
505 struct epoll_event e;
506 size_t cqlen;
507 size_t sqlen;
508 size_t maxlen;
509 size_t sqelen;
510 unsigned kernel_version;
511 uint32_t* sqarray;
512 uint32_t i;
513 char* sq;
514 char* sqe;
515 int ringfd;
516 int no_sqarray;
517
518 sq = MAP_FAILED;
519 sqe = MAP_FAILED;
520
521 if (!uv__use_io_uring())
522 return;
523
524 kernel_version = uv__kernel_version();
525 no_sqarray =
526 UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);
527
528 /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
529 * Mostly academic because we check for a v5.13 kernel afterwards anyway.
530 */
531 memset(&params, 0, sizeof(params));
532 params.flags = flags | no_sqarray;
533
534 if (flags & UV__IORING_SETUP_SQPOLL)
535 params.sq_thread_idle = 10; /* milliseconds */
536
537 /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
538 ringfd = uv__io_uring_setup(entries, &params);
539 if (ringfd == -1)
540 return;
541
542 /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
543 * actually detecting is whether IORING_OP_STATX works with SQPOLL.
544 */
545 if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
546 goto fail;
547
548 /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
549 if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
550 goto fail;
551
552 /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
553 if (!(params.features & UV__IORING_FEAT_NODROP))
554 goto fail;
555
556 sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
557 cqlen =
558 params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
559 maxlen = sqlen < cqlen ? cqlen : sqlen;
560 sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
561
562 sq = mmap(0,
563 maxlen,
564 PROT_READ | PROT_WRITE,
565 MAP_SHARED | MAP_POPULATE,
566 ringfd,
567 0); /* IORING_OFF_SQ_RING */
568
569 sqe = mmap(0,
570 sqelen,
571 PROT_READ | PROT_WRITE,
572 MAP_SHARED | MAP_POPULATE,
573 ringfd,
574 0x10000000ull); /* IORING_OFF_SQES */
575
576 if (sq == MAP_FAILED || sqe == MAP_FAILED)
577 goto fail;
578
579 if (flags & UV__IORING_SETUP_SQPOLL) {
580 /* Only interested in completion events. To get notified when
581 * the kernel pulls items from the submission ring, add POLLOUT.
582 */
583 memset(&e, 0, sizeof(e));
584 e.events = POLLIN;
585 e.data.fd = ringfd;
586
587 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
588 goto fail;
589 }
590
591 iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
592 iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
593 iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
594 iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
595 iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
596 iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
597 iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
598 iou->sq = sq;
599 iou->cqe = sq + params.cq_off.cqes;
600 iou->sqe = sqe;
601 iou->sqlen = sqlen;
602 iou->cqlen = cqlen;
603 iou->maxlen = maxlen;
604 iou->sqelen = sqelen;
605 iou->ringfd = ringfd;
606 iou->in_flight = 0;
607
608 if (no_sqarray)
609 return;
610
611 sqarray = (uint32_t*) (sq + params.sq_off.array);
612 for (i = 0; i <= iou->sqmask; i++)
613 sqarray[i] = i; /* Slot -> sqe identity mapping. */
614
615 return;
616
617 fail:
618 if (sq != MAP_FAILED)
619 munmap(sq, maxlen);
620
621 if (sqe != MAP_FAILED)
622 munmap(sqe, sqelen);
623
624 uv__close(ringfd);
625 }
626
627
628 static void uv__iou_delete(struct uv__iou* iou) {
629 if (iou->ringfd > -1) {
630 munmap(iou->sq, iou->maxlen);
631 munmap(iou->sqe, iou->sqelen);
632 uv__close(iou->ringfd);
633 iou->ringfd = -1;
634 }
635 }
636
637
638 int uv__platform_loop_init(uv_loop_t* loop) {
639 uv__loop_internal_fields_t* lfields;
640
641 lfields = uv__get_internal_fields(loop);
642 lfields->ctl.ringfd = -1;
643 lfields->iou.ringfd = -2; /* "uninitialized" */
644
645 loop->inotify_watchers = NULL;
646 loop->inotify_fd = -1;
647 loop->backend_fd = epoll_create1(O_CLOEXEC);
648
649 if (loop->backend_fd == -1)
650 return UV__ERR(errno);
651
652 uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
653
654 return 0;
655 }
656
657
658 int uv__io_fork(uv_loop_t* loop) {
659 int err;
660 struct watcher_list* root;
661
662 root = uv__inotify_watchers(loop)->rbh_root;
663
664 uv__close(loop->backend_fd);
665 loop->backend_fd = -1;
666
667 /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
668 uv__platform_loop_delete(loop);
669
670 err = uv__platform_loop_init(loop);
671 if (err)
672 return err;
673
674 return uv__inotify_fork(loop, root);
675 }
676
677
678 void uv__platform_loop_delete(uv_loop_t* loop) {
679 uv__loop_internal_fields_t* lfields;
680
681 lfields = uv__get_internal_fields(loop);
682 uv__iou_delete(&lfields->ctl);
683 uv__iou_delete(&lfields->iou);
684
685 if (loop->inotify_fd != -1) {
686 uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
687 uv__close(loop->inotify_fd);
688 loop->inotify_fd = -1;
689 }
690 }
691
692
693 struct uv__invalidate {
694 struct epoll_event (*prep)[256];
695 struct epoll_event* events;
696 int nfds;
697 };
698
699
700 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
701 uv__loop_internal_fields_t* lfields;
702 struct uv__invalidate* inv;
703 struct epoll_event dummy;
704 int i;
705
706 lfields = uv__get_internal_fields(loop);
707 inv = lfields->inv;
708
709 /* Invalidate events with the same file descriptor. */
710 if (inv != NULL)
711 for (i = 0; i < inv->nfds; i++)
712 if (inv->events[i].data.fd == fd)
713 inv->events[i].data.fd = -1;
714
715 /* Remove the file descriptor from the epoll.
716 * This avoids a problem where the same file description remains open
717 * in another process, causing repeated junk epoll events.
718 *
719 * Perform EPOLL_CTL_DEL immediately instead of going through
720 * io_uring's submit queue, otherwise the file descriptor may
721 * be closed by the time the kernel starts the operation.
722 *
723 * We pass in a dummy epoll_event, to work around a bug in old kernels.
724 *
725 * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
726 * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
727 */
728 memset(&dummy, 0, sizeof(dummy));
729 epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
730 }
731
732
733 int uv__io_check_fd(uv_loop_t* loop, int fd) {
734 struct epoll_event e;
735 int rc;
736
737 memset(&e, 0, sizeof(e));
738 e.events = POLLIN;
739 e.data.fd = -1;
740
741 rc = 0;
742 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
743 if (errno != EEXIST)
744 rc = UV__ERR(errno);
745
746 if (rc == 0)
747 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
748 abort();
749
750 return rc;
751 }
752
753
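/* Reserves the next free submission queue entry, zeroes it, and ties it to
 * `req` via sqe->user_data. A sketch of the usage pattern, as seen in the
 * uv__iou_fs_*() helpers below:
 *
 *   sqe = uv__iou_get_sqe(iou, loop, req);
 *   if (sqe == NULL)
 *     return 0;             /+ ring unavailable or full: use the thread pool +/
 *   sqe->opcode = ...;      /+ fill in the operation details +/
 *   uv__iou_submit(iou);
 *   return 1;               /+ completion handled by uv__poll_io_uring() +/
 */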
754 /* Caller must initialize SQE and call uv__iou_submit(). */
755 static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
756 uv_loop_t* loop,
757 uv_fs_t* req) {
758 struct uv__io_uring_sqe* sqe;
759 uint32_t head;
760 uint32_t tail;
761 uint32_t mask;
762 uint32_t slot;
763
764 /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
765 * initialization failed. Anything else is a valid ring file descriptor.
766 */
767 if (iou->ringfd == -2) {
768 /* By default, the SQPOLL ring is not created. Enable it only if the loop is
769 * configured with UV_LOOP_USE_IO_URING_SQPOLL.
770 */
771 if ((loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL) == 0) {
772 iou->ringfd = -1;
773 return NULL;
774 }
775
776 uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);
777 if (iou->ringfd == -2)
778 iou->ringfd = -1; /* "failed" */
779 }
780
781 if (iou->ringfd == -1)
782 return NULL;
783
784 head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
785 memory_order_acquire);
786 tail = *iou->sqtail;
787 mask = iou->sqmask;
788
789 if ((head & mask) == ((tail + 1) & mask))
790 return NULL; /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
791
792 slot = tail & mask;
793 sqe = iou->sqe;
794 sqe = &sqe[slot];
795 memset(sqe, 0, sizeof(*sqe));
796 sqe->user_data = (uintptr_t) req;
797
798 /* Pacify uv_cancel(). */
799 req->work_req.loop = loop;
800 req->work_req.work = NULL;
801 req->work_req.done = NULL;
802 uv__queue_init(&req->work_req.wq);
803
804 uv__req_register(loop);
805 iou->in_flight++;
806
807 return sqe;
808 }
809
810
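/* Publishes the SQE reserved by uv__iou_get_sqe() by bumping the submission
 * queue tail with release semantics, then kicks the SQPOLL kernel thread if
 * it has gone to sleep (UV__IORING_SQ_NEED_WAKEUP). No io_uring_enter() call
 * is needed in the common case because this ring is set up with SQPOLL.
 */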
811 static void uv__iou_submit(struct uv__iou* iou) {
812 uint32_t flags;
813
814 atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
815 *iou->sqtail + 1,
816 memory_order_release);
817
818 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
819 memory_order_acquire);
820
821 if (flags & UV__IORING_SQ_NEED_WAKEUP)
822 if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
823 if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */
824 perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */
825 }
826
827
828 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
829 struct uv__io_uring_sqe* sqe;
830 struct uv__iou* iou;
831 int kv;
832
833 kv = uv__kernel_version();
834 /* Work around a poorly understood bug in older kernels where closing a file
835 * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
836 * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
837 * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
838 * but good candidates are the several data race fixes. Interestingly, it
839 * seems to manifest only when running under Docker so the possibility of
840 * a Docker bug can't be completely ruled out either. Yay, computers.
841 * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
842 * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
843 * solved.
844 */
845 if (kv < /* 5.15.90 */ 0x050F5A)
846 return 0;
847
848 if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
849 return 0;
850
851
852 iou = &uv__get_internal_fields(loop)->iou;
853
854 sqe = uv__iou_get_sqe(iou, loop, req);
855 if (sqe == NULL)
856 return 0;
857
858 sqe->fd = req->file;
859 sqe->opcode = UV__IORING_OP_CLOSE;
860
861 uv__iou_submit(iou);
862
863 return 1;
864 }
865
866
867 int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
868 struct uv__io_uring_sqe* sqe;
869 struct uv__iou* iou;
870
871 if (uv__kernel_version() < /* 6.9 */0x060900)
872 return 0;
873
874 iou = &uv__get_internal_fields(loop)->iou;
875 sqe = uv__iou_get_sqe(iou, loop, req);
876 if (sqe == NULL)
877 return 0;
878
879 sqe->fd = req->file;
880 sqe->len = req->off;
881 sqe->opcode = UV__IORING_OP_FTRUNCATE;
882 uv__iou_submit(iou);
883
884 return 1;
885 }
886
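/* fsync_flags is 0 for fsync() semantics; the caller passes the kernel's
 * IORING_FSYNC_DATASYNC bit to get fdatasync() semantics instead (this
 * describes the expected caller in fs.c, which is not shown here).
 */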
887 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
888 uv_fs_t* req,
889 uint32_t fsync_flags) {
890 struct uv__io_uring_sqe* sqe;
891 struct uv__iou* iou;
892
893 iou = &uv__get_internal_fields(loop)->iou;
894
895 sqe = uv__iou_get_sqe(iou, loop, req);
896 if (sqe == NULL)
897 return 0;
898
899 /* Little known fact: setting sqe->off and sqe->len turns
900 * it into an asynchronous sync_file_range() operation.
901 */
902 sqe->fd = req->file;
903 sqe->fsync_flags = fsync_flags;
904 sqe->opcode = UV__IORING_OP_FSYNC;
905
906 uv__iou_submit(iou);
907
908 return 1;
909 }
910
911
912 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
913 struct uv__io_uring_sqe* sqe;
914 struct uv__iou* iou;
915
916 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
917 return 0;
918
919 iou = &uv__get_internal_fields(loop)->iou;
920 sqe = uv__iou_get_sqe(iou, loop, req);
921 if (sqe == NULL)
922 return 0;
923
924 sqe->addr = (uintptr_t) req->path;
925 sqe->fd = AT_FDCWD;
926 sqe->addr2 = (uintptr_t) req->new_path;
927 sqe->len = AT_FDCWD;
928 sqe->opcode = UV__IORING_OP_LINKAT;
929
930 uv__iou_submit(iou);
931
932 return 1;
933 }
934
935
936 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
937 struct uv__io_uring_sqe* sqe;
938 struct uv__iou* iou;
939
940 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
941 return 0;
942
943 iou = &uv__get_internal_fields(loop)->iou;
944 sqe = uv__iou_get_sqe(iou, loop, req);
945 if (sqe == NULL)
946 return 0;
947
948 sqe->addr = (uintptr_t) req->path;
949 sqe->fd = AT_FDCWD;
950 sqe->len = req->mode;
951 sqe->opcode = UV__IORING_OP_MKDIRAT;
952
953 uv__iou_submit(iou);
954
955 return 1;
956 }
957
958
959 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
960 struct uv__io_uring_sqe* sqe;
961 struct uv__iou* iou;
962
963 iou = &uv__get_internal_fields(loop)->iou;
964
965 sqe = uv__iou_get_sqe(iou, loop, req);
966 if (sqe == NULL)
967 return 0;
968
969 sqe->addr = (uintptr_t) req->path;
970 sqe->fd = AT_FDCWD;
971 sqe->len = req->mode;
972 sqe->opcode = UV__IORING_OP_OPENAT;
973 sqe->open_flags = req->flags | O_CLOEXEC;
974
975 uv__iou_submit(iou);
976
977 return 1;
978 }
979
980
981 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
982 struct uv__io_uring_sqe* sqe;
983 struct uv__iou* iou;
984
985 iou = &uv__get_internal_fields(loop)->iou;
986
987 sqe = uv__iou_get_sqe(iou, loop, req);
988 if (sqe == NULL)
989 return 0;
990
991 sqe->addr = (uintptr_t) req->path;
992 sqe->fd = AT_FDCWD;
993 sqe->addr2 = (uintptr_t) req->new_path;
994 sqe->len = AT_FDCWD;
995 sqe->opcode = UV__IORING_OP_RENAMEAT;
996
997 uv__iou_submit(iou);
998
999 return 1;
1000 }
1001
1002
1003 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
1004 struct uv__io_uring_sqe* sqe;
1005 struct uv__iou* iou;
1006
1007 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
1008 return 0;
1009
1010 iou = &uv__get_internal_fields(loop)->iou;
1011 sqe = uv__iou_get_sqe(iou, loop, req);
1012 if (sqe == NULL)
1013 return 0;
1014
1015 sqe->addr = (uintptr_t) req->path;
1016 sqe->fd = AT_FDCWD;
1017 sqe->addr2 = (uintptr_t) req->new_path;
1018 sqe->opcode = UV__IORING_OP_SYMLINKAT;
1019
1020 uv__iou_submit(iou);
1021
1022 return 1;
1023 }
1024
1025
1026 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
1027 struct uv__io_uring_sqe* sqe;
1028 struct uv__iou* iou;
1029
1030 iou = &uv__get_internal_fields(loop)->iou;
1031
1032 sqe = uv__iou_get_sqe(iou, loop, req);
1033 if (sqe == NULL)
1034 return 0;
1035
1036 sqe->addr = (uintptr_t) req->path;
1037 sqe->fd = AT_FDCWD;
1038 sqe->opcode = UV__IORING_OP_UNLINKAT;
1039
1040 uv__iou_submit(iou);
1041
1042 return 1;
1043 }
1044
1045
1046 int uv__iou_fs_read_or_write(uv_loop_t* loop,
1047 uv_fs_t* req,
1048 int is_read) {
1049 struct uv__io_uring_sqe* sqe;
1050 struct uv__iou* iou;
1051
1052 /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall
1053 * back to the thread pool on writes. */
1054 if (req->nbufs > IOV_MAX) {
1055 if (is_read)
1056 req->nbufs = IOV_MAX;
1057 else
1058 return 0;
1059 }
1060
1061 iou = &uv__get_internal_fields(loop)->iou;
1062
1063 sqe = uv__iou_get_sqe(iou, loop, req);
1064 if (sqe == NULL)
1065 return 0;
1066
1067 sqe->addr = (uintptr_t) req->bufs;
1068 sqe->fd = req->file;
1069 sqe->len = req->nbufs;
1070 sqe->off = req->off < 0 ? -1 : req->off;
1071 sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1072
1073 uv__iou_submit(iou);
1074
1075 return 1;
1076 }
1077
1078
1079 int uv__iou_fs_statx(uv_loop_t* loop,
1080 uv_fs_t* req,
1081 int is_fstat,
1082 int is_lstat) {
1083 struct uv__io_uring_sqe* sqe;
1084 struct uv__statx* statxbuf;
1085 struct uv__iou* iou;
1086
1087 statxbuf = uv__malloc(sizeof(*statxbuf));
1088 if (statxbuf == NULL)
1089 return 0;
1090
1091 iou = &uv__get_internal_fields(loop)->iou;
1092
1093 sqe = uv__iou_get_sqe(iou, loop, req);
1094 if (sqe == NULL) {
1095 uv__free(statxbuf);
1096 return 0;
1097 }
1098
1099 req->ptr = statxbuf;
1100
1101 sqe->addr = (uintptr_t) req->path;
1102 sqe->addr2 = (uintptr_t) statxbuf;
1103 sqe->fd = AT_FDCWD;
1104 sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1105 sqe->opcode = UV__IORING_OP_STATX;
1106
1107 if (is_fstat) {
1108 sqe->addr = (uintptr_t) "";
1109 sqe->fd = req->file;
1110 sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1111 }
1112
1113 if (is_lstat)
1114 sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1115
1116 uv__iou_submit(iou);
1117
1118 return 1;
1119 }
1120
1121
1122 void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
1123 buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
1124 buf->st_mode = statxbuf->stx_mode;
1125 buf->st_nlink = statxbuf->stx_nlink;
1126 buf->st_uid = statxbuf->stx_uid;
1127 buf->st_gid = statxbuf->stx_gid;
1128 buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
1129 buf->st_ino = statxbuf->stx_ino;
1130 buf->st_size = statxbuf->stx_size;
1131 buf->st_blksize = statxbuf->stx_blksize;
1132 buf->st_blocks = statxbuf->stx_blocks;
1133 buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
1134 buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
1135 buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
1136 buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
1137 buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
1138 buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
1139 buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
1140 buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
1141 buf->st_flags = 0;
1142 buf->st_gen = 0;
1143 }
1144
1145
1146 static void uv__iou_fs_statx_post(uv_fs_t* req) {
1147 struct uv__statx* statxbuf;
1148 uv_stat_t* buf;
1149
1150 buf = &req->statbuf;
1151 statxbuf = req->ptr;
1152 req->ptr = NULL;
1153
1154 if (req->result == 0) {
1155 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
1156 uv__statx_to_stat(statxbuf, buf);
1157 req->ptr = buf;
1158 }
1159
1160 uv__free(statxbuf);
1161 }
1162
1163
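/* Drains the io_uring completion queue. Each CQE's user_data points back at
 * the originating uv_fs_t; the (negative errno) result is copied into
 * req->result and the request callback runs. Operations the kernel rejects
 * with -EOPNOTSUPP are silently retried on the thread pool instead.
 */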
1164 static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
1165 struct uv__io_uring_cqe* cqe;
1166 struct uv__io_uring_cqe* e;
1167 uv_fs_t* req;
1168 uint32_t head;
1169 uint32_t tail;
1170 uint32_t mask;
1171 uint32_t i;
1172 uint32_t flags;
1173 int nevents;
1174 int rc;
1175
1176 head = *iou->cqhead;
1177 tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
1178 memory_order_acquire);
1179 mask = iou->cqmask;
1180 cqe = iou->cqe;
1181 nevents = 0;
1182
1183 for (i = head; i != tail; i++) {
1184 e = &cqe[i & mask];
1185
1186 req = (uv_fs_t*) (uintptr_t) e->user_data;
1187 assert(req->type == UV_FS);
1188
1189 uv__req_unregister(loop);
1190 iou->in_flight--;
1191
1192 /* If the op is not supported by the kernel retry using the thread pool */
1193 if (e->res == -EOPNOTSUPP) {
1194 uv__fs_post(loop, req);
1195 continue;
1196 }
1197
1198 /* io_uring stores error codes as negative numbers, same as libuv. */
1199 req->result = e->res;
1200
1201 switch (req->fs_type) {
1202 case UV_FS_FSTAT:
1203 case UV_FS_LSTAT:
1204 case UV_FS_STAT:
1205 uv__iou_fs_statx_post(req);
1206 break;
1207 default: /* Squelch -Wswitch warnings. */
1208 break;
1209 }
1210
1211 uv__metrics_update_idle_time(loop);
1212 req->cb(req);
1213 nevents++;
1214 }
1215
1216 atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
1217 tail,
1218 memory_order_release);
1219
1220 /* Check whether CQEs overflowed; if so, enter the kernel to make them
1221 * available. Don't grab them immediately but in the next loop iteration to
1222 * avoid loop starvation. */
1223 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
1224 memory_order_acquire);
1225
1226 if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
1227 do
1228 rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
1229 while (rc == -1 && errno == EINTR);
1230
1231 if (rc < 0)
1232 perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
1233 }
1234
1235 uv__metrics_inc_events(loop, nevents);
1236 if (uv__get_internal_fields(loop)->current_timeout == 0)
1237 uv__metrics_inc_events_waiting(loop, nevents);
1238 }
1239
1240
1241 /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1242 * executed immediately, otherwise the file descriptor may have been closed
1243 * by the time the kernel starts the operation.
1244 */
1245 static void uv__epoll_ctl_prep(int epollfd,
1246 struct uv__iou* ctl,
1247 struct epoll_event (*events)[256],
1248 int op,
1249 int fd,
1250 struct epoll_event* e) {
1251 struct uv__io_uring_sqe* sqe;
1252 struct epoll_event* pe;
1253 uint32_t mask;
1254 uint32_t slot;
1255
1256 assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
1257 assert(ctl->ringfd != -1);
1258
1259 mask = ctl->sqmask;
1260 slot = (*ctl->sqtail)++ & mask;
1261
1262 pe = &(*events)[slot];
1263 *pe = *e;
1264
1265 sqe = ctl->sqe;
1266 sqe = &sqe[slot];
1267
1268 memset(sqe, 0, sizeof(*sqe));
1269 sqe->addr = (uintptr_t) pe;
1270 sqe->fd = epollfd;
1271 sqe->len = op;
1272 sqe->off = fd;
1273 sqe->opcode = UV__IORING_OP_EPOLL_CTL;
1274 sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
1275
1276 if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
1277 uv__epoll_ctl_flush(epollfd, ctl, events);
1278 }
1279
1280
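/* Note on the user_data encoding used by uv__epoll_ctl_prep() above:
 *
 *   sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
 *
 * i.e. the epoll op lives in bits 0-1 (hence the EPOLL_CTL_* < 4 asserts
 * near the top of this file), the 0-255 batch slot in bits 2-9, and the
 * target fd in the upper 32 bits. uv__epoll_ctl_flush() below decodes it to
 * retry failed EPOLL_CTL_ADD submissions as EPOLL_CTL_MOD.
 */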
1281 static void uv__epoll_ctl_flush(int epollfd,
1282 struct uv__iou* ctl,
1283 struct epoll_event (*events)[256]) {
1284 struct epoll_event oldevents[256];
1285 struct uv__io_uring_cqe* cqe;
1286 uint32_t oldslot;
1287 uint32_t slot;
1288 uint32_t n;
1289 int fd;
1290 int op;
1291 int rc;
1292
1293 STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
1294 assert(ctl->ringfd != -1);
1295 assert(*ctl->sqhead != *ctl->sqtail);
1296
1297 n = *ctl->sqtail - *ctl->sqhead;
1298 do
1299 rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
1300 while (rc == -1 && errno == EINTR);
1301
1302 if (rc < 0)
1303 perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
1304
1305 if (rc != (int) n)
1306 abort();
1307
1308 assert(*ctl->sqhead == *ctl->sqtail);
1309
1310 memcpy(oldevents, *events, sizeof(*events));
1311
1312 /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
1313 * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
1314 * that we are already watching. Ignore the former and retry the latter
1315 * with EPOLL_CTL_MOD.
1316 */
1317 while (*ctl->cqhead != *ctl->cqtail) {
1318 slot = (*ctl->cqhead)++ & ctl->cqmask;
1319
1320 cqe = ctl->cqe;
1321 cqe = &cqe[slot];
1322
1323 if (cqe->res == 0)
1324 continue;
1325
1326 fd = cqe->user_data >> 32;
1327 op = 3 & cqe->user_data;
1328 oldslot = 255 & (cqe->user_data >> 2);
1329
1330 if (op == EPOLL_CTL_DEL)
1331 continue;
1332
1333 if (op != EPOLL_CTL_ADD)
1334 abort();
1335
1336 if (cqe->res != -EEXIST)
1337 abort();
1338
1339 uv__epoll_ctl_prep(epollfd,
1340 ctl,
1341 events,
1342 EPOLL_CTL_MOD,
1343 fd,
1344 &oldevents[oldslot]);
1345 }
1346 }
1347
1348
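/* The main poll loop: registers pending watcher changes with epoll (batched
 * through the ctl ring when io_uring is available), blocks in epoll_pwait(),
 * then dispatches events to their watchers. io_uring completions and signal
 * watchers are handled specially: their events end the polling round so the
 * rest of the event loop can run.
 */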
1349 void uv__io_poll(uv_loop_t* loop, int timeout) {
1350 uv__loop_internal_fields_t* lfields;
1351 struct epoll_event events[1024];
1352 struct epoll_event prep[256];
1353 struct uv__invalidate inv;
1354 struct epoll_event* pe;
1355 struct epoll_event e;
1356 struct uv__iou* ctl;
1357 struct uv__iou* iou;
1358 int real_timeout;
1359 struct uv__queue* q;
1360 uv__io_t* w;
1361 sigset_t* sigmask;
1362 sigset_t sigset;
1363 uint64_t base;
1364 int have_iou_events;
1365 int have_signals;
1366 int nevents;
1367 int epollfd;
1368 int count;
1369 int nfds;
1370 int fd;
1371 int op;
1372 int i;
1373 int user_timeout;
1374 int reset_timeout;
1375
1376 lfields = uv__get_internal_fields(loop);
1377 ctl = &lfields->ctl;
1378 iou = &lfields->iou;
1379
1380 sigmask = NULL;
1381 if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
1382 sigemptyset(&sigset);
1383 sigaddset(&sigset, SIGPROF);
1384 sigmask = &sigset;
1385 }
1386
1387 assert(timeout >= -1);
1388 base = loop->time;
1389 count = 48; /* Benchmarks suggest this gives the best throughput. */
1390 real_timeout = timeout;
1391
1392 if (lfields->flags & UV_METRICS_IDLE_TIME) {
1393 reset_timeout = 1;
1394 user_timeout = timeout;
1395 timeout = 0;
1396 } else {
1397 reset_timeout = 0;
1398 user_timeout = 0;
1399 }
1400
1401 epollfd = loop->backend_fd;
1402
1403 memset(&e, 0, sizeof(e));
1404
1405 while (!uv__queue_empty(&loop->watcher_queue)) {
1406 q = uv__queue_head(&loop->watcher_queue);
1407 w = uv__queue_data(q, uv__io_t, watcher_queue);
1408 uv__queue_remove(q);
1409 uv__queue_init(q);
1410
1411 op = EPOLL_CTL_MOD;
1412 if (w->events == 0)
1413 op = EPOLL_CTL_ADD;
1414
1415 w->events = w->pevents;
1416 e.events = w->pevents;
1417 e.data.fd = w->fd;
1418 fd = w->fd;
1419
1420 if (ctl->ringfd != -1) {
1421 uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
1422 continue;
1423 }
1424
1425 if (!epoll_ctl(epollfd, op, fd, &e))
1426 continue;
1427
1428 assert(op == EPOLL_CTL_ADD);
1429 assert(errno == EEXIST);
1430
1431 /* File descriptor that's been watched before, update event mask. */
1432 if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
1433 abort();
1434 }
1435
1436 inv.events = events;
1437 inv.prep = &prep;
1438 inv.nfds = -1;
1439
1440 for (;;) {
1441 if (loop->nfds == 0)
1442 if (iou->in_flight == 0)
1443 break;
1444
1445 /* All event mask mutations should be visible to the kernel before
1446 * we enter epoll_pwait().
1447 */
1448 if (ctl->ringfd != -1)
1449 while (*ctl->sqhead != *ctl->sqtail)
1450 uv__epoll_ctl_flush(epollfd, ctl, &prep);
1451
1452 /* Only need to set the provider_entry_time if timeout != 0. The function
1453 * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
1454 */
1455 if (timeout != 0)
1456 uv__metrics_set_provider_entry_time(loop);
1457
1458 /* Store the current timeout in a location that's globally accessible so
1459 * other locations like uv__work_done() can determine whether the queue
1460 * of events in the callback were waiting when poll was called.
1461 */
1462 lfields->current_timeout = timeout;
1463
1464 nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
1465
1466 /* Update loop->time unconditionally. It's tempting to skip the update when
1467 * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
1468 * operating system didn't reschedule our process while in the syscall.
1469 */
1470 SAVE_ERRNO(uv__update_time(loop));
1471
1472 if (nfds == -1)
1473 assert(errno == EINTR);
1474 else if (nfds == 0)
1475 /* Unlimited timeout should only return with events or signal. */
1476 assert(timeout != -1);
1477
1478 if (nfds == 0 || nfds == -1) {
1479 if (reset_timeout != 0) {
1480 timeout = user_timeout;
1481 reset_timeout = 0;
1482 } else if (nfds == 0) {
1483 return;
1484 }
1485
1486 /* Interrupted by a signal. Update timeout and poll again. */
1487 goto update_timeout;
1488 }
1489
1490 have_iou_events = 0;
1491 have_signals = 0;
1492 nevents = 0;
1493
1494 inv.nfds = nfds;
1495 lfields->inv = &inv;
1496
1497 for (i = 0; i < nfds; i++) {
1498 pe = events + i;
1499 fd = pe->data.fd;
1500
1501 /* Skip invalidated events, see uv__platform_invalidate_fd */
1502 if (fd == -1)
1503 continue;
1504
1505 if (fd == iou->ringfd) {
1506 uv__poll_io_uring(loop, iou);
1507 have_iou_events = 1;
1508 continue;
1509 }
1510
1511 assert(fd >= 0);
1512 assert((unsigned) fd < loop->nwatchers);
1513
1514 w = loop->watchers[fd];
1515
1516 if (w == NULL) {
1517 /* File descriptor that we've stopped watching, disarm it.
1518 *
1519 * Ignore all errors because we may be racing with another thread
1520 * when the file descriptor is closed.
1521 *
1522 * Perform EPOLL_CTL_DEL immediately instead of going through
1523 * io_uring's submit queue, otherwise the file descriptor may
1524 * be closed by the time the kernel starts the operation.
1525 */
1526 epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
1527 continue;
1528 }
1529
1530 /* Give users only events they're interested in. Prevents spurious
1531 * callbacks when previous callback invocation in this loop has stopped
1532 * the current watcher. Also filters out events that the user has not
1533 * requested us to watch.
1534 */
1535 pe->events &= w->pevents | POLLERR | POLLHUP;
1536
1537 /* Work around an epoll quirk where it sometimes reports just the
1538 * EPOLLERR or EPOLLHUP event. In order to force the event loop to
1539 * move forward, we merge in the read/write events that the watcher
1540 * is interested in; uv__read() and uv__write() will then deal with
1541 * the error or hangup in the usual fashion.
1542 *
1543 * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
1544 * reads the available data, calls uv_read_stop(), then sometime later
1545 * calls uv_read_start() again. By then, libuv has forgotten about the
1546 * hangup and the kernel won't report EPOLLIN again because there's
1547 * nothing left to read. If anything, libuv is to blame here. The
1548 * current hack is just a quick bandaid; to properly fix it, libuv
1549 * needs to remember the error/hangup event. We should get that for
1550 * free when we switch over to edge-triggered I/O.
1551 */
1552 if (pe->events == POLLERR || pe->events == POLLHUP)
1553 pe->events |=
1554 w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);
1555
1556 if (pe->events != 0) {
1557 /* Run signal watchers last. This also affects child process watchers
1558 * because those are implemented in terms of signal watchers.
1559 */
1560 if (w == &loop->signal_io_watcher) {
1561 have_signals = 1;
1562 } else {
1563 uv__metrics_update_idle_time(loop);
1564 w->cb(loop, w, pe->events);
1565 }
1566
1567 nevents++;
1568 }
1569 }
1570
1571 uv__metrics_inc_events(loop, nevents);
1572 if (reset_timeout != 0) {
1573 timeout = user_timeout;
1574 reset_timeout = 0;
1575 uv__metrics_inc_events_waiting(loop, nevents);
1576 }
1577
1578 if (have_signals != 0) {
1579 uv__metrics_update_idle_time(loop);
1580 loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
1581 }
1582
1583 lfields->inv = NULL;
1584
1585 if (have_iou_events != 0)
1586 break; /* Event loop should cycle now so don't poll again. */
1587
1588 if (have_signals != 0)
1589 break; /* Event loop should cycle now so don't poll again. */
1590
1591 if (nevents != 0) {
1592 if (nfds == ARRAY_SIZE(events) && --count != 0) {
1593 /* Poll for more events but don't block this time. */
1594 timeout = 0;
1595 continue;
1596 }
1597 break;
1598 }
1599
1600 update_timeout:
1601 if (timeout == 0)
1602 break;
1603
1604 if (timeout == -1)
1605 continue;
1606
1607 assert(timeout > 0);
1608
1609 real_timeout -= (loop->time - base);
1610 if (real_timeout <= 0)
1611 break;
1612
1613 timeout = real_timeout;
1614 }
1615
1616 if (ctl->ringfd != -1)
1617 while (*ctl->sqhead != *ctl->sqtail)
1618 uv__epoll_ctl_flush(epollfd, ctl, &prep);
1619 }
1620
1621 uint64_t uv__hrtime(uv_clocktype_t type) {
1622 static _Atomic clock_t fast_clock_id = -1;
1623 struct timespec t;
1624 clock_t clock_id;
1625
1626 /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
1627 * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is
1628 * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
1629 * decide to make a costly system call.
1630 */
1631 /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
1632 * when it has microsecond granularity or better (unlikely).
1633 */
1634 clock_id = CLOCK_MONOTONIC;
1635 if (type != UV_CLOCK_FAST)
1636 goto done;
1637
1638 clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
1639 if (clock_id != -1)
1640 goto done;
1641
1642 clock_id = CLOCK_MONOTONIC;
1643 if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
1644 if (t.tv_nsec <= 1 * 1000 * 1000)
1645 clock_id = CLOCK_MONOTONIC_COARSE;
1646
1647 atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
1648
1649 done:
1650
1651 if (clock_gettime(clock_id, &t))
1652 return 0; /* Not really possible. */
1653
1654 return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
1655 }
1656
1657
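/* Parses the resident set size out of /proc/self/stat. The rss value is the
 * 24th field, but the second field (the command name, in parentheses) may
 * itself contain spaces, so scanning starts after the last ')' and then
 * skips 22 space-separated fields. The result is reported in bytes.
 */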
1658 int uv_resident_set_memory(size_t* rss) {
1659 char buf[1024];
1660 const char* s;
1661 long val;
1662 int rc;
1663 int i;
1664
1665 /* rss: 24th element */
1666 rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1667 if (rc < 0)
1668 return rc;
1669
1670 /* find the last ')' */
1671 s = strrchr(buf, ')');
1672 if (s == NULL)
1673 goto err;
1674
1675 for (i = 1; i <= 22; i++) {
1676 s = strchr(s + 1, ' ');
1677 if (s == NULL)
1678 goto err;
1679 }
1680
1681 errno = 0;
1682 val = strtol(s, NULL, 10);
1683 if (val < 0 || errno != 0)
1684 goto err;
1685
1686 *rss = val * getpagesize();
1687 return 0;
1688
1689 err:
1690 return UV_EINVAL;
1691 }
1692
1693 int uv_uptime(double* uptime) {
1694 struct timespec now;
1695 char buf[128];
1696
1697 /* Consult /proc/uptime when present (common case), or fall back to
1698 * clock_gettime. Why not always clock_gettime? It doesn't always return the
1699 * right result under OpenVZ and possibly other containerized environments.
1700 */
1701 if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1702 if (1 == sscanf(buf, "%lf", uptime))
1703 return 0;
1704
1705 if (clock_gettime(CLOCK_BOOTTIME, &now))
1706 return UV__ERR(errno);
1707
1708 *uptime = now.tv_sec;
1709 return 0;
1710 }
1711
1712
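/* Builds the uv_cpu_info_t array from three sources: per-CPU tick counters
 * from /proc/stat, model names from /proc/cpuinfo (with an extra part-number
 * lookup table on arm64), and the current clock speed from
 * /sys/devices/system/cpu/cpuN/cpufreq/scaling_cur_freq, which is reported
 * in kHz and divided by 1000 to get MHz.
 */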
1713 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
1714 #if defined(__PPC__)
1715 static const char model_marker[] = "cpu\t\t: ";
1716 #elif defined(__arm__)
1717 static const char model_marker[] = "Processor\t: ";
1718 #elif defined(__aarch64__)
1719 static const char model_marker[] = "CPU part\t: ";
1720 #elif defined(__mips__)
1721 static const char model_marker[] = "cpu model\t\t: ";
1722 #elif defined(__loongarch__)
1723 static const char model_marker[] = "cpu family\t\t: ";
1724 #else
1725 static const char model_marker[] = "model name\t: ";
1726 #endif
1727 static const char parts[] =
1728 #ifdef __aarch64__
1729 "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n"
1730 "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n"
1731 "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n"
1732 "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
1733 "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n"
1734 "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n"
1735 "0xc0d\nCortex-A17\n" /* Originally A12 */
1736 "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n"
1737 "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n"
1738 "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n"
1739 "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n"
1740 "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n"
1741 "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n"
1742 "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n"
1743 "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n"
1744 "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n"
1745 "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n"
1746 "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n"
1747 #endif
1748 "";
1749 struct cpu {
1750 unsigned long long freq, user, nice, sys, idle, irq;
1751 unsigned model;
1752 };
1753 FILE* fp;
1754 char* p;
1755 int found;
1756 int n;
1757 unsigned i;
1758 unsigned cpu;
1759 unsigned maxcpu;
1760 unsigned size;
1761 unsigned long long skip;
1762 struct cpu (*cpus)[8192]; /* Kernel maximum. */
1763 struct cpu* c;
1764 struct cpu t;
1765 char (*model)[64];
1766 unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
1767 /* Assumption: even big.LITTLE systems will have only a handful
1768 * of different CPU models. Most systems will just have one.
1769 */
1770 char models[8][64];
1771 char buf[1024];
1772
1773 memset(bitmap, 0, sizeof(bitmap));
1774 memset(models, 0, sizeof(models));
1775 snprintf(*models, sizeof(*models), "unknown");
1776 maxcpu = 0;
1777
1778 cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
1779 if (cpus == NULL)
1780 return UV_ENOMEM;
1781
1782 fp = uv__open_file("/proc/stat");
1783 if (fp == NULL) {
1784 uv__free(cpus);
1785 return UV__ERR(errno);
1786 }
1787
1788 if (NULL == fgets(buf, sizeof(buf), fp))
1789 abort();
1790
1791 for (;;) {
1792 memset(&t, 0, sizeof(t));
1793
1794 n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
1795 &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
1796
1797 if (n != 7)
1798 break;
1799
1800 if (NULL == fgets(buf, sizeof(buf), fp))
1801 abort();
1802
1803 if (cpu >= ARRAY_SIZE(*cpus))
1804 continue;
1805
1806 (*cpus)[cpu] = t;
1807
1808 bitmap[cpu >> 3] |= 1 << (cpu & 7);
1809
1810 if (cpu >= maxcpu)
1811 maxcpu = cpu + 1;
1812 }
1813
1814 fclose(fp);
1815
1816 fp = uv__open_file("/proc/cpuinfo");
1817 if (fp == NULL)
1818 goto nocpuinfo;
1819
1820 for (;;) {
1821 if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
1822 break; /* Parse error. */
1823
1824 found = 0;
1825 while (!found && fgets(buf, sizeof(buf), fp))
1826 found = !strncmp(buf, model_marker, sizeof(model_marker) - 1);
1827
1828 if (!found)
1829 goto next;
1830
1831 p = buf + sizeof(model_marker) - 1;
1832 n = (int) strcspn(p, "\n");
1833
1834 /* arm64: translate CPU part code to model name. */
1835 if (*parts) {
1836 p = memmem(parts, sizeof(parts) - 1, p, n + 1);
1837 if (p == NULL)
1838 p = "unknown";
1839 else
1840 p += n + 1;
1841 n = (int) strcspn(p, "\n");
1842 }
1843
1844 found = 0;
1845 for (model = models; !found && model < ARRAY_END(models); model++)
1846 found = !strncmp(p, *model, strlen(*model));
1847
1848 if (!found)
1849 goto next;
1850
1851 if (**model == '\0')
1852 snprintf(*model, sizeof(*model), "%.*s", n, p);
1853
1854 if (cpu < maxcpu)
1855 (*cpus)[cpu].model = model - models;
1856
1857 next:
1858 while (fgets(buf, sizeof(buf), fp))
1859 if (*buf == '\n')
1860 break;
1861 }
1862
1863 fclose(fp);
1864 fp = NULL;
1865
1866 nocpuinfo:
1867
1868 n = 0;
1869 for (cpu = 0; cpu < maxcpu; cpu++) {
1870 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1871 continue;
1872
1873 n++;
1874 snprintf(buf, sizeof(buf),
1875 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
1876
1877 fp = uv__open_file(buf);
1878 if (fp == NULL)
1879 continue;
1880
1881 if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
1882 abort();
1883 fclose(fp);
1884 fp = NULL;
1885 }
1886
1887 size = n * sizeof(**ci) + sizeof(models);
1888 *ci = uv__malloc(size);
1889 *count = 0;
1890
1891 if (*ci == NULL) {
1892 uv__free(cpus);
1893 return UV_ENOMEM;
1894 }
1895
1896 *count = n;
1897 p = memcpy(*ci + n, models, sizeof(models));
1898
1899 i = 0;
1900 for (cpu = 0; cpu < maxcpu; cpu++) {
1901 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1902 continue;
1903
1904 c = *cpus + cpu;
1905
1906 (*ci)[i++] = (uv_cpu_info_t) {
1907 .model = p + c->model * sizeof(*model),
1908 .speed = c->freq / 1000,
1909 /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
1910 * therefore the multiplier is always 1000/100 = 10.
1911 */
1912 .cpu_times = (struct uv_cpu_times_s) {
1913 .user = 10 * c->user,
1914 .nice = 10 * c->nice,
1915 .sys = 10 * c->sys,
1916 .idle = 10 * c->idle,
1917 .irq = 10 * c->irq,
1918 },
1919 };
1920 }
1921
1922 uv__free(cpus);
1923
1924 return 0;
1925 }
1926
1927
1928 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
1929 if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
1930 return 1;
1931 if (ent->ifa_addr == NULL)
1932 return 1;
1933 /*
1934 * On Linux getifaddrs returns information related to the raw underlying
1935 * devices. We're not interested in this information yet.
1936 */
1937 if (ent->ifa_addr->sa_family == PF_PACKET)
1938 return exclude_type;
1939 return !exclude_type;
1940 }
1941
1942 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
1943 struct ifaddrs *addrs, *ent;
1944 uv_interface_address_t* address;
1945 int i;
1946 struct sockaddr_ll *sll;
1947
1948 *count = 0;
1949 *addresses = NULL;
1950
1951 if (getifaddrs(&addrs))
1952 return UV__ERR(errno);
1953
1954 /* Count the number of interfaces */
1955 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1956 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1957 continue;
1958
1959 (*count)++;
1960 }
1961
1962 if (*count == 0) {
1963 freeifaddrs(addrs);
1964 return 0;
1965 }
1966
1967 /* Make sure the memory is initialized to zero using calloc(). */
  *addresses = uv__calloc(*count, sizeof(**addresses));
  if (!(*addresses)) {
    freeifaddrs(addrs);
    return UV_ENOMEM;
  }

  address = *addresses;

  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
    if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
      continue;

    address->name = uv__strdup(ent->ifa_name);

    if (ent->ifa_addr->sa_family == AF_INET6) {
      address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
    } else {
      address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
    }

    if (ent->ifa_netmask->sa_family == AF_INET6) {
      address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
    } else {
      address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
    }

    address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);

    address++;
  }

  /* Fill in physical addresses for each interface */
  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
    if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
      continue;

    address = *addresses;

    for (i = 0; i < (*count); i++) {
      size_t namelen = strlen(ent->ifa_name);
      /* Alias interfaces share the same physical address */
      if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
          (address->name[namelen] == 0 || address->name[namelen] == ':')) {
        sll = (struct sockaddr_ll*)ent->ifa_addr;
        memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
      }
      address++;
    }
  }

  freeifaddrs(addrs);

  return 0;
}


void uv_free_interface_addresses(uv_interface_address_t* addresses,
                                 int count) {
  int i;

  for (i = 0; i < count; i++) {
    uv__free(addresses[i].name);
  }

  uv__free(addresses);
}


void uv__set_process_title(const char* title) {
#if defined(PR_SET_NAME)
  prctl(PR_SET_NAME, title);  /* Only copies first 16 characters. */
#endif
}


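/* Returns the value of the "<what> N kB" line in /proc/meminfo, converted to
 * bytes, or 0 when the file or the field could not be read.
 */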
static uint64_t uv__read_proc_meminfo(const char* what) {
  uint64_t rc;
  char* p;
  char buf[4096];  /* Large enough to hold all of /proc/meminfo. */

  if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
    return 0;

  p = strstr(buf, what);

  if (p == NULL)
    return 0;

  p += strlen(what);

  rc = 0;
  sscanf(p, "%" PRIu64 " kB", &rc);

  return rc * 1024;
}


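/* Prefer MemAvailable from /proc/meminfo; fall back to sysinfo()'s freeram
 * (scaled by mem_unit) when that field is missing or unreadable.
 */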
uint64_t uv_get_free_memory(void) {
  struct sysinfo info;
  uint64_t rc;

  rc = uv__read_proc_meminfo("MemAvailable:");

  if (rc != 0)
    return rc;

  if (0 == sysinfo(&info))
    return (uint64_t) info.freeram * info.mem_unit;

  return 0;
}


uint64_t uv_get_total_memory(void) {
  struct sysinfo info;
  uint64_t rc;

  rc = uv__read_proc_meminfo("MemTotal:");

  if (rc != 0)
    return rc;

  if (0 == sysinfo(&info))
    return (uint64_t) info.totalram * info.mem_unit;

  return 0;
}


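/* Reads a single unsigned 64-bit value from a file such as a cgroup control
 * file. The literal "max" is mapped to UINT64_MAX; any error reads as 0.
 */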
static uint64_t uv__read_uint64(const char* filename) {
  char buf[32];  /* Large enough to hold an encoded uint64_t. */
  uint64_t rc;

  rc = 0;
  if (0 == uv__slurp(filename, buf, sizeof(buf)))
    if (1 != sscanf(buf, "%" PRIu64, &rc))
      if (0 == strcmp(buf, "max\n"))
        rc = UINT64_MAX;

  return rc;
}


/* Given a buffer with the contents of the cgroup v1 /proc/self/cgroup file,
 * finds the location and length of the memory controller mount path.
 * This disregards the leading / for easy concatenation of paths.
 * Returns NULL if the memory controller wasn't found. */
static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
                                                int* n) {
  char* p;

  /* Seek to the memory controller line. */
  p = strchr(buf, ':');
  while (p != NULL && strncmp(p, ":memory:", 8)) {
    p = strchr(p, '\n');
    if (p != NULL)
      p = strchr(p, ':');
  }

  if (p != NULL) {
    /* Determine the length of the mount path. */
    p = p + strlen(":memory:/");
    *n = (int) strcspn(p, "\n");
  }

  return p;
}

static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;
  uint64_t cgroup1_max;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
    *high = uv__read_uint64(filename);

    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
    *max = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (*high != 0 && *max != 0)
      goto update_limits;
  }

  /* Fall back to the limits of the global memory controller. */
  *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
  *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");

  /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
   * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
   */
update_limits:
  cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
  if (*high == cgroup1_max)
    *high = UINT64_MAX;
  if (*max == cgroup1_max)
    *max = UINT64_MAX;
}

static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = buf + strlen("0::/");
  n = (int) strcspn(p, "\n");

  /* Read the memory limits of the controller. */
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
  *max = uv__read_uint64(filename);
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
  *high = uv__read_uint64(filename);
}

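/* Returns the tighter of the cgroup's "high" and "max" memory limits, or 0
 * when no limit could be determined. `buf` holds /proc/self/cgroup; a leading
 * "0::/" entry selects the cgroup v2 code path.
 */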
static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
  uint64_t high;
  uint64_t max;

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    uv__get_cgroup1_memory_limits(buf, &high, &max);
  else
    uv__get_cgroup2_memory_limits(buf, &high, &max);

  if (high == 0 || max == 0)
    return 0;

  return high < max ? high : max;
}

uint64_t uv_get_constrained_memory(void) {
  char buf[1024];

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  return uv__get_cgroup_constrained_memory(buf);
}


static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
  char filename[4097];
  uint64_t current;
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
    current = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the read above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (current != 0)
      return current;
  }

  /* Fall back to the usage of the global memory controller. */
  return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
}

static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
  char filename[4097];
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = buf + strlen("0::/");
  n = (int) strcspn(p, "\n");

  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.current", n, p);
  return uv__read_uint64(filename);
}

uint64_t uv_get_available_memory(void) {
  char buf[1024];
  uint64_t constrained;
  uint64_t current;
  uint64_t total;

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  constrained = uv__get_cgroup_constrained_memory(buf);
  if (constrained == 0)
    return uv_get_free_memory();

  total = uv_get_total_memory();
  if (constrained > total)
    return uv_get_free_memory();

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    current = uv__get_cgroup1_current_memory(buf);
  else
    current = uv__get_cgroup2_current_memory(buf);

  /* memory usage can be higher than the limit (for short bursts of time) */
  if (constrained < current)
    return 0;

  return constrained - current;
}


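/* Parses the cgroup v2 cpu.max file ("<quota> <period>", where the quota may
 * be the literal "max") and cpu.weight, storing the quota, period and the
 * weight relative to the default of 100 in `constraint`.
 */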
static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
                                            uv__cpu_constraint* constraint) {
  char path[256];
  char buf[1024];
  unsigned int weight;
  int cgroup_size;
  const char* cgroup_trimmed;
  char quota_buf[16];

  if (strncmp(cgroup, "0::/", 4) != 0)
    return UV_EINVAL;

  /* Isolate the cgroup path: skip the "0::/" prefix and measure its length
   * up to the trailing newline. The buffer itself is left unmodified.
   */
  cgroup_trimmed = cgroup + sizeof("0::/") - 1;
  cgroup_size = (int)strcspn(cgroup_trimmed, "\n");

  /* Construct the path to the cpu.max file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.max", cgroup_size,
           cgroup_trimmed);

  /* Read cpu.max */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%15s %llu", quota_buf, &constraint->period_length) != 2)
    return UV_EINVAL;

  if (strncmp(quota_buf, "max", 3) == 0)
    constraint->quota_per_period = LLONG_MAX;
  else if (sscanf(quota_buf, "%lld", &constraint->quota_per_period) != 1)
    return UV_EINVAL;  /* Conversion failed. */

  /* Construct the path to the cpu.weight file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.weight", cgroup_size,
           cgroup_trimmed);

  /* Read cpu.weight */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%u", &weight) != 1)
    return UV_EINVAL;

  constraint->proportions = (double)weight / 100.0;

  return 0;
}

static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
                                             int* cgroup_size) {
  /* Seek to the cpu controller line. */
  char* cgroup_cpu = strstr(cgroup, ":cpu,");

  if (cgroup_cpu != NULL) {
    /* Skip the controller prefix to the start of the cgroup path. */
    cgroup_cpu += sizeof(":cpu,") - 1;
    /* Determine the length of the cgroup path, excluding the newline. */
    *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
  }

  return cgroup_cpu;
}

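/* cgroup v1 equivalent: reads cpu.cfs_quota_us, cpu.cfs_period_us and
 * cpu.shares from the cpu controller's mount point; the proportion is the
 * share count relative to the default of 1024.
 */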
static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
                                            uv__cpu_constraint* constraint) {
  char path[256];
  char buf[1024];
  unsigned int shares;
  int cgroup_size;
  char* cgroup_cpu;

  cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);

  if (cgroup_cpu == NULL)
    return UV_EIO;

  /* Construct the path to the cpu.cfs_quota_us file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
           cgroup_size, cgroup_cpu);

  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%lld", &constraint->quota_per_period) != 1)
    return UV_EINVAL;

  /* Construct the path to the cpu.cfs_period_us file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
           cgroup_size, cgroup_cpu);

  /* Read cpu.cfs_period_us */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%lld", &constraint->period_length) != 1)
    return UV_EINVAL;

  /* Construct the path to the cpu.shares file */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.shares", cgroup_size,
           cgroup_cpu);

  /* Read cpu.shares */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%u", &shares) != 1)
    return UV_EINVAL;

  constraint->proportions = (double)shares / 1024.0;

  return 0;
}

int uv__get_constrained_cpu(uv__cpu_constraint* constraint) {
  char cgroup[1024];

  /* Read the cgroup from /proc/self/cgroup */
  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
    return UV_EIO;

  /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
   * The entry for cgroup v2 is always in the format "0::$PATH"
   * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
  if (strncmp(cgroup, "0::/", 4) == 0)
    return uv__get_cgroupv2_constrained_cpu(cgroup, constraint);
  else
    return uv__get_cgroupv1_constrained_cpu(cgroup, constraint);
}


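/* Prefer parsing /proc/loadavg; fall back to sysinfo(), whose load averages
 * are fixed-point numbers with a 16-bit fractional part, hence the division
 * by 65536.
 */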
void uv_loadavg(double avg[3]) {
  struct sysinfo info;
  char buf[128];  /* Large enough to hold all of /proc/loadavg. */

  if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
    if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
      return;

  if (sysinfo(&info) < 0)
    return;

  avg[0] = (double) info.loads[0] / 65536.0;
  avg[1] = (double) info.loads[1] / 65536.0;
  avg[2] = (double) info.loads[2] / 65536.0;
}


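/* Orders watcher_list nodes by inotify watch descriptor for the loop's
 * red-black tree of watchers.
 */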
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b) {
  if (a->wd < b->wd) return -1;
  if (a->wd > b->wd) return 1;
  return 0;
}


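/* Lazily creates the loop's shared inotify descriptor and starts polling it
 * for readability; a no-op if it has already been initialized.
 */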
static int init_inotify(uv_loop_t* loop) {
  int fd;

  if (loop->inotify_fd != -1)
    return 0;

  fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
  if (fd < 0)
    return UV__ERR(errno);

  loop->inotify_fd = fd;
  uv__io_init(&loop->inotify_read_watcher, uv__inotify_read, loop->inotify_fd);
  uv__io_start(loop, &loop->inotify_read_watcher, POLLIN);

  return 0;
}


static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
  /* Open the inotify_fd, and re-arm all the inotify watchers. */
  int err;
  struct watcher_list* tmp_watcher_list_iter;
  struct watcher_list* watcher_list;
  struct watcher_list tmp_watcher_list;
  struct uv__queue queue;
  struct uv__queue* q;
  uv_fs_event_t* handle;
  char* tmp_path;

  if (root == NULL)
    return 0;

  /* We must restore the old watcher list to be able to close items
   * out of it.
   */
  loop->inotify_watchers = root;

  uv__queue_init(&tmp_watcher_list.watchers);
  /* Note that the queue we use is shared with the uv_fs_event_start() and
   * uv_fs_event_stop() functions, making uv__queue_foreach unsafe to use.
   * So we use the uv__queue_move trick to safely iterate. Also don't free
   * the watcher list until we're done iterating; cf. uv__inotify_read.
   */
  RB_FOREACH_SAFE(watcher_list, watcher_root,
                  uv__inotify_watchers(loop), tmp_watcher_list_iter) {
    watcher_list->iterating = 1;
    uv__queue_move(&watcher_list->watchers, &queue);
    while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      /* It's critical to keep a copy of path here, because it
       * will be set to NULL by stop() and then deallocated by
       * maybe_free_watcher_list
       */
      tmp_path = uv__strdup(handle->path);
      assert(tmp_path != NULL);
      uv__queue_remove(q);
      uv__queue_insert_tail(&watcher_list->watchers, q);
      uv_fs_event_stop(handle);

      uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
      handle->path = tmp_path;
    }
    watcher_list->iterating = 0;
    maybe_free_watcher_list(watcher_list, loop);
  }

  uv__queue_move(&tmp_watcher_list.watchers, &queue);
  while (!uv__queue_empty(&queue)) {
    q = uv__queue_head(&queue);
    uv__queue_remove(q);
    handle = uv__queue_data(q, uv_fs_event_t, watchers);
    tmp_path = handle->path;
    handle->path = NULL;
    err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
    uv__free(tmp_path);
    if (err)
      return err;
  }

  return 0;
}


static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
  struct watcher_list w;
  w.wd = wd;
  return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
}


static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
  /* if the watcher_list->watchers is being iterated over, we can't free it. */
  if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
    /* No watchers left for this path. Clean up. */
    RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
    inotify_rm_watch(loop->inotify_fd, w->wd);
    uv__free(w);
  }
}


static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* dummy,
                             unsigned int events) {
  const struct inotify_event* e;
  struct watcher_list* w;
  uv_fs_event_t* h;
  struct uv__queue queue;
  struct uv__queue* q;
  const char* path;
  ssize_t size;
  const char *p;
  /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
  char buf[4096];

  for (;;) {
    do
      size = read(loop->inotify_fd, buf, sizeof(buf));
    while (size == -1 && errno == EINTR);

    if (size == -1) {
      assert(errno == EAGAIN || errno == EWOULDBLOCK);
      break;
    }

    assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */

    /* Now we have one or more inotify_event structs. */
    for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
      e = (const struct inotify_event*) p;

      events = 0;
      if (e->mask & (IN_ATTRIB|IN_MODIFY))
        events |= UV_CHANGE;
      if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
        events |= UV_RENAME;

      w = find_watcher(loop, e->wd);
      if (w == NULL)
        continue; /* Stale event, no watchers left. */

      /* inotify does not return the filename when monitoring a single file
       * for modifications. Repurpose the filename for API compatibility.
       * I'm not convinced this is a good thing, maybe it should go.
       */
      path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);

      /* We're about to iterate over the queue and call user's callbacks.
       * What can go wrong?
       * A callback could call uv_fs_event_stop()
       * and the queue can change under our feet.
       * So, we use uv__queue_move() trick to safely iterate over the queue.
       * And we don't free the watcher_list until we're done iterating.
       *
       * First,
       * tell uv_fs_event_stop() (that could be called from a user's callback)
       * not to free watcher_list.
       */
      w->iterating = 1;
      uv__queue_move(&w->watchers, &queue);
      while (!uv__queue_empty(&queue)) {
        q = uv__queue_head(&queue);
        h = uv__queue_data(q, uv_fs_event_t, watchers);

        uv__queue_remove(q);
        uv__queue_insert_tail(&w->watchers, q);

        h->cb(h, path, events, 0);
      }
      /* done iterating, time to (maybe) free empty watcher_list */
      w->iterating = 0;
      maybe_free_watcher_list(w, loop);
    }
  }
}


int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
  return 0;
}


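/* Adds an inotify watch for `path`. Watches that resolve to the same watch
 * descriptor share a single watcher_list node, keyed by descriptor in the
 * loop's red-black tree, with every interested handle queued on it.
 */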
int uv_fs_event_start(uv_fs_event_t* handle,
                      uv_fs_event_cb cb,
                      const char* path,
                      unsigned int flags) {
  struct watcher_list* w;
  uv_loop_t* loop;
  size_t len;
  int events;
  int err;
  int wd;

  if (uv__is_active(handle))
    return UV_EINVAL;

  loop = handle->loop;

  err = init_inotify(loop);
  if (err)
    return err;

  events = IN_ATTRIB
         | IN_CREATE
         | IN_MODIFY
         | IN_DELETE
         | IN_DELETE_SELF
         | IN_MOVE_SELF
         | IN_MOVED_FROM
         | IN_MOVED_TO;

  wd = inotify_add_watch(loop->inotify_fd, path, events);
  if (wd == -1)
    return UV__ERR(errno);

  w = find_watcher(loop, wd);
  if (w)
    goto no_insert;

  len = strlen(path) + 1;
  w = uv__malloc(sizeof(*w) + len);
  if (w == NULL)
    return UV_ENOMEM;

  w->wd = wd;
  w->path = memcpy(w + 1, path, len);
  uv__queue_init(&w->watchers);
  w->iterating = 0;
  RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);

no_insert:
  uv__handle_start(handle);
  uv__queue_insert_tail(&w->watchers, &handle->watchers);
  handle->path = w->path;
  handle->cb = cb;
  handle->wd = wd;

  return 0;
}


int uv_fs_event_stop(uv_fs_event_t* handle) {
  struct watcher_list* w;

  if (!uv__is_active(handle))
    return 0;

  w = find_watcher(handle->loop, handle->wd);
  assert(w != NULL);

  handle->wd = -1;
  handle->path = NULL;
  uv__handle_stop(handle);
  uv__queue_remove(&handle->watchers);

  maybe_free_watcher_list(w, handle->loop);

  return 0;
}


void uv__fs_event_close(uv_fs_event_t* handle) {
  uv_fs_event_stop(handle);
}