1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2 * Permission is hereby granted, free of charge, to any person obtaining a copy
3 * of this software and associated documentation files (the "Software"), to
4 * deal in the Software without restriction, including without limitation the
5 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6 * sell copies of the Software, and to permit persons to whom the Software is
7 * furnished to do so, subject to the following conditions:
8 *
9 * The above copyright notice and this permission notice shall be included in
10 * all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18 * IN THE SOFTWARE.
19 */
20
21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22 * EPOLL* counterparts. We use the POLL* variants in this file because that
23 * is what libuv uses elsewhere.
24 */
25
26 #include "uv.h"
27 #include "internal.h"
28
29 #include <inttypes.h>
30 #include <stdatomic.h>
31 #include <stddef.h> /* offsetof */
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <errno.h>
38
39 #include <fcntl.h>
40 #include <ifaddrs.h>
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <netpacket/packet.h>
44 #include <sys/epoll.h>
45 #include <sys/inotify.h>
46 #include <sys/mman.h>
47 #include <sys/param.h>
48 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/stat.h>
51 #include <sys/syscall.h>
52 #include <sys/sysinfo.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/utsname.h>
56 #include <time.h>
57 #include <unistd.h>
58
59 #ifndef __NR_io_uring_setup
60 # define __NR_io_uring_setup 425
61 #endif
62
63 #ifndef __NR_io_uring_enter
64 # define __NR_io_uring_enter 426
65 #endif
66
67 #ifndef __NR_io_uring_register
68 # define __NR_io_uring_register 427
69 #endif
70
71 #ifndef __NR_copy_file_range
72 # if defined(__x86_64__)
73 # define __NR_copy_file_range 326
74 # elif defined(__i386__)
75 # define __NR_copy_file_range 377
76 # elif defined(__s390__)
77 # define __NR_copy_file_range 375
78 # elif defined(__arm__)
79 # define __NR_copy_file_range 391
80 # elif defined(__aarch64__)
81 # define __NR_copy_file_range 285
82 # elif defined(__powerpc__)
83 # define __NR_copy_file_range 379
84 # elif defined(__arc__)
85 # define __NR_copy_file_range 285
86 # elif defined(__riscv)
87 # define __NR_copy_file_range 285
88 # endif
89 #endif /* __NR_copy_file_range */
90
91 #ifndef __NR_statx
92 # if defined(__x86_64__)
93 # define __NR_statx 332
94 # elif defined(__i386__)
95 # define __NR_statx 383
96 # elif defined(__aarch64__)
97 # define __NR_statx 397
98 # elif defined(__arm__)
99 # define __NR_statx 397
100 # elif defined(__ppc__)
101 # define __NR_statx 383
102 # elif defined(__s390__)
103 # define __NR_statx 379
104 # elif defined(__riscv)
105 # define __NR_statx 291
106 # endif
107 #endif /* __NR_statx */
108
109 #ifndef __NR_getrandom
110 # if defined(__x86_64__)
111 # define __NR_getrandom 318
112 # elif defined(__i386__)
113 # define __NR_getrandom 355
114 # elif defined(__aarch64__)
115 # define __NR_getrandom 384
116 # elif defined(__arm__)
117 # define __NR_getrandom 384
118 # elif defined(__ppc__)
119 # define __NR_getrandom 359
120 # elif defined(__s390__)
121 # define __NR_getrandom 349
122 # elif defined(__riscv)
123 # define __NR_getrandom 278
124 # endif
125 #endif /* __NR_getrandom */
126
127 enum {
128 UV__IORING_SETUP_SQPOLL = 2u,
129 UV__IORING_SETUP_NO_SQARRAY = 0x10000u,
130 };
131
132 enum {
133 UV__IORING_FEAT_SINGLE_MMAP = 1u,
134 UV__IORING_FEAT_NODROP = 2u,
135 UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */
136 };
137
138 enum {
139 UV__IORING_OP_READV = 1,
140 UV__IORING_OP_WRITEV = 2,
141 UV__IORING_OP_FSYNC = 3,
142 UV__IORING_OP_OPENAT = 18,
143 UV__IORING_OP_CLOSE = 19,
144 UV__IORING_OP_STATX = 21,
145 UV__IORING_OP_EPOLL_CTL = 29,
146 UV__IORING_OP_RENAMEAT = 35,
147 UV__IORING_OP_UNLINKAT = 36,
148 UV__IORING_OP_MKDIRAT = 37,
149 UV__IORING_OP_SYMLINKAT = 38,
150 UV__IORING_OP_LINKAT = 39,
151 UV__IORING_OP_FTRUNCATE = 55,
152 };
153
154 enum {
155 UV__IORING_ENTER_GETEVENTS = 1u,
156 UV__IORING_ENTER_SQ_WAKEUP = 2u,
157 };
158
159 enum {
160 UV__IORING_SQ_NEED_WAKEUP = 1u,
161 UV__IORING_SQ_CQ_OVERFLOW = 2u,
162 };
163
164 struct uv__io_cqring_offsets {
165 uint32_t head;
166 uint32_t tail;
167 uint32_t ring_mask;
168 uint32_t ring_entries;
169 uint32_t overflow;
170 uint32_t cqes;
171 uint64_t reserved0;
172 uint64_t reserved1;
173 };
174
175 STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
176
177 struct uv__io_sqring_offsets {
178 uint32_t head;
179 uint32_t tail;
180 uint32_t ring_mask;
181 uint32_t ring_entries;
182 uint32_t flags;
183 uint32_t dropped;
184 uint32_t array;
185 uint32_t reserved0;
186 uint64_t reserved1;
187 };
188
189 STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
190
191 struct uv__io_uring_cqe {
192 uint64_t user_data;
193 int32_t res;
194 uint32_t flags;
195 };
196
197 STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
198
199 struct uv__io_uring_sqe {
200 uint8_t opcode;
201 uint8_t flags;
202 uint16_t ioprio;
203 int32_t fd;
204 union {
205 uint64_t off;
206 uint64_t addr2;
207 };
208 union {
209 uint64_t addr;
210 };
211 uint32_t len;
212 union {
213 uint32_t rw_flags;
214 uint32_t fsync_flags;
215 uint32_t open_flags;
216 uint32_t statx_flags;
217 };
218 uint64_t user_data;
219 union {
220 uint16_t buf_index;
221 uint64_t pad[3];
222 };
223 };
224
225 STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
226 STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
227 STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
228 STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
229 STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
230 STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
231 STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
232 STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
233 STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
234 STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
235 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
236
237 struct uv__io_uring_params {
238 uint32_t sq_entries;
239 uint32_t cq_entries;
240 uint32_t flags;
241 uint32_t sq_thread_cpu;
242 uint32_t sq_thread_idle;
243 uint32_t features;
244 uint32_t reserved[4];
245 struct uv__io_sqring_offsets sq_off; /* 40 bytes */
246 struct uv__io_cqring_offsets cq_off; /* 40 bytes */
247 };
248
249 STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
250 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
251 STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
252
253 STATIC_ASSERT(EPOLL_CTL_ADD < 4);
254 STATIC_ASSERT(EPOLL_CTL_DEL < 4);
255 STATIC_ASSERT(EPOLL_CTL_MOD < 4);
256
257 struct watcher_list {
258 RB_ENTRY(watcher_list) entry;
259 struct uv__queue watchers;
260 int iterating;
261 char* path;
262 int wd;
263 };
264
265 struct watcher_root {
266 struct watcher_list* rbh_root;
267 };
268
269 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
270 static void uv__inotify_read(uv_loop_t* loop,
271 uv__io_t* w,
272 unsigned int revents);
273 static int compare_watchers(const struct watcher_list* a,
274 const struct watcher_list* b);
275 static void maybe_free_watcher_list(struct watcher_list* w,
276 uv_loop_t* loop);
277
278 static void uv__epoll_ctl_flush(int epollfd,
279 struct uv__iou* ctl,
280 struct epoll_event (*events)[256]);
281
282 static void uv__epoll_ctl_prep(int epollfd,
283 struct uv__iou* ctl,
284 struct epoll_event (*events)[256],
285 int op,
286 int fd,
287 struct epoll_event* e);
288
289 RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
290
291
292 static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
293 /* This cast works because watcher_root is a struct with a pointer as its
294 * sole member. Such type punning is unsafe in the presence of strict
295 * pointer aliasing (and is just plain nasty) but that is why libuv
296 * is compiled with -fno-strict-aliasing.
297 */
298 return (struct watcher_root*) &loop->inotify_watchers;
299 }
300
301
302 unsigned uv__kernel_version(void) {
303 static _Atomic unsigned cached_version;
304 struct utsname u;
305 unsigned version;
306 unsigned major;
307 unsigned minor;
308 unsigned patch;
309 char v_sig[256];
310 char* needle;
311
312 version = atomic_load_explicit(&cached_version, memory_order_relaxed);
313 if (version != 0)
314 return version;
315
316 /* Check /proc/version_signature first as it's the way to get the mainline
317 * kernel version in Ubuntu. The format is:
318 * Ubuntu ubuntu_kernel_version mainline_kernel_version
319 * For example:
320 * Ubuntu 5.15.0-79.86-generic 5.15.111
321 */
322 if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
323 if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
324 goto calculate_version;
325
326 if (-1 == uname(&u))
327 return 0;
328
329 /* In Debian we need to check `version` instead of `release` to extract the
330 * mainline kernel version. Here is an example of what it looks like:
331 * #1 SMP Debian 5.10.46-4 (2021-08-03)
332 */
333 needle = strstr(u.version, "Debian ");
334 if (needle != NULL)
335 if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
336 goto calculate_version;
337
338 if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
339 return 0;
340
341 /* Handle it when the process runs under the UNAME26 personality:
342 *
343 * - kernels >= 3.x identify as 2.6.40+x
344 * - kernels >= 4.x identify as 2.6.60+x
345 *
346 * UNAME26 is a poorly conceived hack that doesn't let us distinguish
347 * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
348 * that 2.6.60+x means 4.x.
349 *
350 * Fun fact of the day: it's technically possible to observe the actual
351 * kernel version for a brief moment because uname() first copies out the
352 * real release string before overwriting it with the backcompat string.
353 */
354 if (major == 2 && minor == 6) {
355 if (patch >= 60) {
356 major = 4;
357 minor = patch - 60;
358 patch = 0;
359 } else if (patch >= 40) {
360 major = 3;
361 minor = patch - 40;
362 patch = 0;
363 }
364 }
365
366 calculate_version:
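/* Pack as major * 65536 + minor * 256 + patch so callers can compare
 * versions numerically, e.g. 5.15.111 encodes as 0x050F6F.
 */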
367 version = major * 65536 + minor * 256 + patch;
368 atomic_store_explicit(&cached_version, version, memory_order_relaxed);
369
370 return version;
371 }
372
373
374 ssize_t
375 uv__fs_copy_file_range(int fd_in,
376 off_t* off_in,
377 int fd_out,
378 off_t* off_out,
379 size_t len,
380 unsigned int flags)
381 {
382 #ifdef __NR_copy_file_range
383 return syscall(__NR_copy_file_range,
384 fd_in,
385 off_in,
386 fd_out,
387 off_out,
388 len,
389 flags);
390 #else
391 return errno = ENOSYS, -1;
392 #endif
393 }
394
395
396 int uv__statx(int dirfd,
397 const char* path,
398 int flags,
399 unsigned int mask,
400 struct uv__statx* statxbuf) {
401 #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
402 return errno = ENOSYS, -1;
403 #else
404 int rc;
405
406 rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
407 if (rc >= 0)
408 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
409
410 return rc;
411 #endif
412 }
413
414
415 ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
416 #if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
417 return errno = ENOSYS, -1;
418 #else
419 ssize_t rc;
420
421 rc = syscall(__NR_getrandom, buf, buflen, flags);
422 if (rc >= 0)
423 uv__msan_unpoison(buf, buflen);
424
425 return rc;
426 #endif
427 }
428
429
430 int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
431 return syscall(__NR_io_uring_setup, entries, params);
432 }
433
434
435 int uv__io_uring_enter(int fd,
436 unsigned to_submit,
437 unsigned min_complete,
438 unsigned flags) {
439 /* io_uring_enter used to take a sigset_t but it's unused
440 * in newer kernels unless IORING_ENTER_EXT_ARG is set,
441 * in which case it takes a struct io_uring_getevents_arg.
442 */
443 return syscall(__NR_io_uring_enter,
444 fd,
445 to_submit,
446 min_complete,
447 flags,
448 NULL,
449 0L);
450 }
451
452
453 int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
454 return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
455 }
456
457
458 static int uv__use_io_uring(uint32_t flags) {
459 #if defined(__ANDROID_API__)
460 return 0; /* Possibly available but blocked by seccomp. */
461 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
462 /* See https://github.com/libuv/libuv/issues/4158. */
463 return 0; /* All 32-bit kernels appear buggy. */
464 #elif defined(__powerpc64__) || defined(__ppc64__)
465 /* See https://github.com/libuv/libuv/issues/4283. */
466 return 0; /* Random SIGSEGV in signal handler. */
467 #else
468 /* Ternary: unknown=0, yes=1, no=-1 */
469 static _Atomic int use_io_uring;
470 char* val;
471 int use;
472
473 #if defined(__hppa__)
474 /* io_uring was first supported on parisc in 6.1, functional as of 6.1.51.
475 * https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/
476 */
477 if (uv__kernel_version() < /*6.1.51*/0x060133)
478 return 0;
479 #endif
480
481 /* SQPOLL is all kinds of buggy but epoll batching should work fine. */
482 if (0 == (flags & UV__IORING_SETUP_SQPOLL))
483 return 1;
484
485 /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
486 if (uv__kernel_version() < /*5.10.186*/0x050ABA)
487 return 0;
488
489 use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
490
491 if (use == 0) {
492 val = getenv("UV_USE_IO_URING");
493 use = val != NULL && atoi(val) > 0 ? 1 : -1;
494 atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
495 }
496
497 return use > 0;
498 #endif
499 }
500
501
502 static void uv__iou_init(int epollfd,
503 struct uv__iou* iou,
504 uint32_t entries,
505 uint32_t flags) {
506 struct uv__io_uring_params params;
507 struct epoll_event e;
508 size_t cqlen;
509 size_t sqlen;
510 size_t maxlen;
511 size_t sqelen;
512 unsigned kernel_version;
513 uint32_t* sqarray;
514 uint32_t i;
515 char* sq;
516 char* sqe;
517 int ringfd;
518 int no_sqarray;
519
520 sq = MAP_FAILED;
521 sqe = MAP_FAILED;
522
523 if (!uv__use_io_uring(flags))
524 return;
525
526 kernel_version = uv__kernel_version();
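/* Kernels v6.6 and newer understand IORING_SETUP_NO_SQARRAY; multiplying the
 * flag by the boolean comparison yields either the flag or 0.
 */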
527 no_sqarray =
528 UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);
529
530 /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
531 * Mostly academic because we check for a v5.13 kernel afterwards anyway.
532 */
533 memset(&params, 0, sizeof(params));
534 params.flags = flags | no_sqarray;
535
536 if (flags & UV__IORING_SETUP_SQPOLL)
537 params.sq_thread_idle = 10; /* milliseconds */
538
539 /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
540 ringfd = uv__io_uring_setup(entries, &params);
541 if (ringfd == -1)
542 return;
543
544 /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
545 * actually detecting is whether IORING_OP_STATX works with SQPOLL.
546 */
547 if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
548 goto fail;
549
550 /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
551 if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
552 goto fail;
553
554 /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
555 if (!(params.features & UV__IORING_FEAT_NODROP))
556 goto fail;
557
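/* With IORING_FEAT_SINGLE_MMAP the submission and completion rings share a
 * single mapping, so map the larger of the two sizes once; the SQE array
 * lives in a second, separate mapping at IORING_OFF_SQES.
 */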
558 sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
559 cqlen =
560 params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
561 maxlen = sqlen < cqlen ? cqlen : sqlen;
562 sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
563
564 sq = mmap(0,
565 maxlen,
566 PROT_READ | PROT_WRITE,
567 MAP_SHARED | MAP_POPULATE,
568 ringfd,
569 0); /* IORING_OFF_SQ_RING */
570
571 sqe = mmap(0,
572 sqelen,
573 PROT_READ | PROT_WRITE,
574 MAP_SHARED | MAP_POPULATE,
575 ringfd,
576 0x10000000ull); /* IORING_OFF_SQES */
577
578 if (sq == MAP_FAILED || sqe == MAP_FAILED)
579 goto fail;
580
581 if (flags & UV__IORING_SETUP_SQPOLL) {
582 /* Only interested in completion events. To get notified when
583 * the kernel pulls items from the submission ring, add POLLOUT.
584 */
585 memset(&e, 0, sizeof(e));
586 e.events = POLLIN;
587 e.data.fd = ringfd;
588
589 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
590 goto fail;
591 }
592
593 iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
594 iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
595 iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
596 iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
597 iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
598 iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
599 iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
600 iou->sq = sq;
601 iou->cqe = sq + params.cq_off.cqes;
602 iou->sqe = sqe;
603 iou->sqlen = sqlen;
604 iou->cqlen = cqlen;
605 iou->maxlen = maxlen;
606 iou->sqelen = sqelen;
607 iou->ringfd = ringfd;
608 iou->in_flight = 0;
609
610 if (no_sqarray)
611 return;
612
613 sqarray = (uint32_t*) (sq + params.sq_off.array);
614 for (i = 0; i <= iou->sqmask; i++)
615 sqarray[i] = i; /* Slot -> sqe identity mapping. */
616
617 return;
618
619 fail:
620 if (sq != MAP_FAILED)
621 munmap(sq, maxlen);
622
623 if (sqe != MAP_FAILED)
624 munmap(sqe, sqelen);
625
626 uv__close(ringfd);
627 }
628
629
630 static void uv__iou_delete(struct uv__iou* iou) {
631 if (iou->ringfd > -1) {
632 munmap(iou->sq, iou->maxlen);
633 munmap(iou->sqe, iou->sqelen);
634 uv__close(iou->ringfd);
635 iou->ringfd = -1;
636 }
637 }
638
639
640 int uv__platform_loop_init(uv_loop_t* loop) {
641 uv__loop_internal_fields_t* lfields;
642
643 lfields = uv__get_internal_fields(loop);
644 lfields->ctl.ringfd = -1;
645 lfields->iou.ringfd = -2; /* "uninitialized" */
646
647 loop->inotify_watchers = NULL;
648 loop->inotify_fd = -1;
649 loop->backend_fd = epoll_create1(O_CLOEXEC);
650
651 if (loop->backend_fd == -1)
652 return UV__ERR(errno);
653
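/* The "ctl" ring is only used to batch EPOLL_CTL operations. The fs ring
 * ("iou") is created lazily by uv__iou_get_sqe() on first use.
 */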
654 uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
655
656 return 0;
657 }
658
659
660 int uv__io_fork(uv_loop_t* loop) {
661 int err;
662 struct watcher_list* root;
663
664 root = uv__inotify_watchers(loop)->rbh_root;
665
666 uv__close(loop->backend_fd);
667 loop->backend_fd = -1;
668
669 /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
670 uv__platform_loop_delete(loop);
671
672 err = uv__platform_loop_init(loop);
673 if (err)
674 return err;
675
676 return uv__inotify_fork(loop, root);
677 }
678
679
680 void uv__platform_loop_delete(uv_loop_t* loop) {
681 uv__loop_internal_fields_t* lfields;
682
683 lfields = uv__get_internal_fields(loop);
684 uv__iou_delete(&lfields->ctl);
685 uv__iou_delete(&lfields->iou);
686
687 if (loop->inotify_fd != -1) {
688 uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
689 uv__close(loop->inotify_fd);
690 loop->inotify_fd = -1;
691 }
692 }
693
694
695 struct uv__invalidate {
696 struct epoll_event (*prep)[256];
697 struct epoll_event* events;
698 int nfds;
699 };
700
701
702 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
703 uv__loop_internal_fields_t* lfields;
704 struct uv__invalidate* inv;
705 struct epoll_event dummy;
706 int i;
707
708 lfields = uv__get_internal_fields(loop);
709 inv = lfields->inv;
710
711 /* Invalidate events with the same file descriptor */
712 if (inv != NULL)
713 for (i = 0; i < inv->nfds; i++)
714 if (inv->events[i].data.fd == fd)
715 inv->events[i].data.fd = -1;
716
717 /* Remove the file descriptor from the epoll.
718 * This avoids a problem where the same file description remains open
719 * in another process, causing repeated junk epoll events.
720 *
721 * Perform EPOLL_CTL_DEL immediately instead of going through
722 * io_uring's submit queue, otherwise the file descriptor may
723 * be closed by the time the kernel starts the operation.
724 *
725 * We pass in a dummy epoll_event, to work around a bug in old kernels.
726 *
727 * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
728 * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
729 */
730 memset(&dummy, 0, sizeof(dummy));
731 epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
732 }
733
734
735 int uv__io_check_fd(uv_loop_t* loop, int fd) {
736 struct epoll_event e;
737 int rc;
738
739 memset(&e, 0, sizeof(e));
740 e.events = POLLIN;
741 e.data.fd = -1;
742
743 rc = 0;
744 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
745 if (errno != EEXIST)
746 rc = UV__ERR(errno);
747
748 if (rc == 0)
749 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
750 abort();
751
752 return rc;
753 }
754
755
756 /* Caller must initialize SQE and call uv__iou_submit(). */
757 static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
758 uv_loop_t* loop,
759 uv_fs_t* req) {
760 struct uv__io_uring_sqe* sqe;
761 uint32_t head;
762 uint32_t tail;
763 uint32_t mask;
764 uint32_t slot;
765
766 /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
767 * initialization failed. Anything else is a valid ring file descriptor.
768 */
769 if (iou->ringfd == -2) {
770 /* By default, the SQPOLL ring is not created. Enable it only if the loop
771 * is configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING
772 * environment variable is set to a positive number.
773 */
774 if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
775 if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
776 uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);
777
778 if (iou->ringfd == -2)
779 iou->ringfd = -1; /* "failed" */
780 }
781
782 if (iou->ringfd == -1)
783 return NULL;
784
785 head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
786 memory_order_acquire);
787 tail = *iou->sqtail;
788 mask = iou->sqmask;
789
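/* The ring is full when advancing the tail by one slot would collide with
 * the head; entries are a power of two, hence the mask arithmetic.
 */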
790 if ((head & mask) == ((tail + 1) & mask))
791 return NULL; /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
792
793 slot = tail & mask;
794 sqe = iou->sqe;
795 sqe = &sqe[slot];
796 memset(sqe, 0, sizeof(*sqe));
797 sqe->user_data = (uintptr_t) req;
798
799 /* Pacify uv_cancel(). */
800 req->work_req.loop = loop;
801 req->work_req.work = NULL;
802 req->work_req.done = NULL;
803 uv__queue_init(&req->work_req.wq);
804
805 uv__req_register(loop);
806 iou->in_flight++;
807
808 return sqe;
809 }
810
811
812 static void uv__iou_submit(struct uv__iou* iou) {
813 uint32_t flags;
814
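/* Publish the new tail with release semantics so the kernel (or the SQPOLL
 * thread) observes the fully written SQE before it sees the tail advance.
 */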
815 atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
816 *iou->sqtail + 1,
817 memory_order_release);
818
819 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
820 memory_order_acquire);
821
822 if (flags & UV__IORING_SQ_NEED_WAKEUP)
823 if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
824 if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */
825 perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */
826 }
827
828
829 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
830 struct uv__io_uring_sqe* sqe;
831 struct uv__iou* iou;
832 int kv;
833
834 kv = uv__kernel_version();
835 /* Work around a poorly understood bug in older kernels where closing a file
836 * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
837 * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
838 * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
839 * but good candidates are the several data race fixes. Interestingly, it
840 * seems to manifest only when running under Docker so the possibility of
841 * a Docker bug can't be completely ruled out either. Yay, computers.
842 * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
843 * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
844 * solved.
845 */
846 if (kv < /* 5.15.90 */ 0x050F5A)
847 return 0;
848
849 if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
850 return 0;
851
852
853 iou = &uv__get_internal_fields(loop)->iou;
854
855 sqe = uv__iou_get_sqe(iou, loop, req);
856 if (sqe == NULL)
857 return 0;
858
859 sqe->fd = req->file;
860 sqe->opcode = UV__IORING_OP_CLOSE;
861
862 uv__iou_submit(iou);
863
864 return 1;
865 }
866
867
868 int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
869 struct uv__io_uring_sqe* sqe;
870 struct uv__iou* iou;
871
872 if (uv__kernel_version() < /* 6.9 */0x060900)
873 return 0;
874
875 iou = &uv__get_internal_fields(loop)->iou;
876 sqe = uv__iou_get_sqe(iou, loop, req);
877 if (sqe == NULL)
878 return 0;
879
880 sqe->fd = req->file;
881 sqe->len = req->off;
882 sqe->opcode = UV__IORING_OP_FTRUNCATE;
883 uv__iou_submit(iou);
884
885 return 1;
886 }
887
888 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
889 uv_fs_t* req,
890 uint32_t fsync_flags) {
891 struct uv__io_uring_sqe* sqe;
892 struct uv__iou* iou;
893
894 iou = &uv__get_internal_fields(loop)->iou;
895
896 sqe = uv__iou_get_sqe(iou, loop, req);
897 if (sqe == NULL)
898 return 0;
899
900 /* Little known fact: setting sqe->off and sqe->len turns
901 * it into an asynchronous sync_file_range() operation.
902 */
903 sqe->fd = req->file;
904 sqe->fsync_flags = fsync_flags;
905 sqe->opcode = UV__IORING_OP_FSYNC;
906
907 uv__iou_submit(iou);
908
909 return 1;
910 }
911
912
913 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
914 struct uv__io_uring_sqe* sqe;
915 struct uv__iou* iou;
916
917 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
918 return 0;
919
920 iou = &uv__get_internal_fields(loop)->iou;
921 sqe = uv__iou_get_sqe(iou, loop, req);
922 if (sqe == NULL)
923 return 0;
924
925 sqe->addr = (uintptr_t) req->path;
926 sqe->fd = AT_FDCWD;
927 sqe->addr2 = (uintptr_t) req->new_path;
928 sqe->len = AT_FDCWD;
929 sqe->opcode = UV__IORING_OP_LINKAT;
930
931 uv__iou_submit(iou);
932
933 return 1;
934 }
935
936
937 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
938 struct uv__io_uring_sqe* sqe;
939 struct uv__iou* iou;
940
941 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
942 return 0;
943
944 iou = &uv__get_internal_fields(loop)->iou;
945 sqe = uv__iou_get_sqe(iou, loop, req);
946 if (sqe == NULL)
947 return 0;
948
949 sqe->addr = (uintptr_t) req->path;
950 sqe->fd = AT_FDCWD;
951 sqe->len = req->mode;
952 sqe->opcode = UV__IORING_OP_MKDIRAT;
953
954 uv__iou_submit(iou);
955
956 return 1;
957 }
958
959
960 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
961 struct uv__io_uring_sqe* sqe;
962 struct uv__iou* iou;
963
964 iou = &uv__get_internal_fields(loop)->iou;
965
966 sqe = uv__iou_get_sqe(iou, loop, req);
967 if (sqe == NULL)
968 return 0;
969
970 sqe->addr = (uintptr_t) req->path;
971 sqe->fd = AT_FDCWD;
972 sqe->len = req->mode;
973 sqe->opcode = UV__IORING_OP_OPENAT;
974 sqe->open_flags = req->flags | O_CLOEXEC;
975
976 uv__iou_submit(iou);
977
978 return 1;
979 }
980
981
982 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
983 struct uv__io_uring_sqe* sqe;
984 struct uv__iou* iou;
985
986 iou = &uv__get_internal_fields(loop)->iou;
987
988 sqe = uv__iou_get_sqe(iou, loop, req);
989 if (sqe == NULL)
990 return 0;
991
992 sqe->addr = (uintptr_t) req->path;
993 sqe->fd = AT_FDCWD;
994 sqe->addr2 = (uintptr_t) req->new_path;
995 sqe->len = AT_FDCWD;
996 sqe->opcode = UV__IORING_OP_RENAMEAT;
997
998 uv__iou_submit(iou);
999
1000 return 1;
1001 }
1002
1003
1004 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
1005 struct uv__io_uring_sqe* sqe;
1006 struct uv__iou* iou;
1007
1008 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
1009 return 0;
1010
1011 iou = &uv__get_internal_fields(loop)->iou;
1012 sqe = uv__iou_get_sqe(iou, loop, req);
1013 if (sqe == NULL)
1014 return 0;
1015
1016 sqe->addr = (uintptr_t) req->path;
1017 sqe->fd = AT_FDCWD;
1018 sqe->addr2 = (uintptr_t) req->new_path;
1019 sqe->opcode = UV__IORING_OP_SYMLINKAT;
1020
1021 uv__iou_submit(iou);
1022
1023 return 1;
1024 }
1025
1026
1027 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
1028 struct uv__io_uring_sqe* sqe;
1029 struct uv__iou* iou;
1030
1031 iou = &uv__get_internal_fields(loop)->iou;
1032
1033 sqe = uv__iou_get_sqe(iou, loop, req);
1034 if (sqe == NULL)
1035 return 0;
1036
1037 sqe->addr = (uintptr_t) req->path;
1038 sqe->fd = AT_FDCWD;
1039 sqe->opcode = UV__IORING_OP_UNLINKAT;
1040
1041 uv__iou_submit(iou);
1042
1043 return 1;
1044 }
1045
1046
1047 int uv__iou_fs_read_or_write(uv_loop_t* loop,
1048 uv_fs_t* req,
1049 int is_read) {
1050 struct uv__io_uring_sqe* sqe;
1051 struct uv__iou* iou;
1052
1053 /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall
1054 * back to the thread pool on writes. */
1055 if (req->nbufs > IOV_MAX) {
1056 if (is_read)
1057 req->nbufs = IOV_MAX;
1058 else
1059 return 0;
1060 }
1061
1062 iou = &uv__get_internal_fields(loop)->iou;
1063
1064 sqe = uv__iou_get_sqe(iou, loop, req);
1065 if (sqe == NULL)
1066 return 0;
1067
1068 sqe->addr = (uintptr_t) req->bufs;
1069 sqe->fd = req->file;
1070 sqe->len = req->nbufs;
1071 sqe->off = req->off < 0 ? -1 : req->off;
1072 sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1073
1074 uv__iou_submit(iou);
1075
1076 return 1;
1077 }
1078
1079
1080 int uv__iou_fs_statx(uv_loop_t* loop,
1081 uv_fs_t* req,
1082 int is_fstat,
1083 int is_lstat) {
1084 struct uv__io_uring_sqe* sqe;
1085 struct uv__statx* statxbuf;
1086 struct uv__iou* iou;
1087
1088 statxbuf = uv__malloc(sizeof(*statxbuf));
1089 if (statxbuf == NULL)
1090 return 0;
1091
1092 iou = &uv__get_internal_fields(loop)->iou;
1093
1094 sqe = uv__iou_get_sqe(iou, loop, req);
1095 if (sqe == NULL) {
1096 uv__free(statxbuf);
1097 return 0;
1098 }
1099
1100 req->ptr = statxbuf;
1101
1102 sqe->addr = (uintptr_t) req->path;
1103 sqe->addr2 = (uintptr_t) statxbuf;
1104 sqe->fd = AT_FDCWD;
1105 sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1106 sqe->opcode = UV__IORING_OP_STATX;
1107
1108 if (is_fstat) {
1109 sqe->addr = (uintptr_t) "";
1110 sqe->fd = req->file;
1111 sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1112 }
1113
1114 if (is_lstat)
1115 sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1116
1117 uv__iou_submit(iou);
1118
1119 return 1;
1120 }
1121
1122
1123 void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
1124 buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
1125 buf->st_mode = statxbuf->stx_mode;
1126 buf->st_nlink = statxbuf->stx_nlink;
1127 buf->st_uid = statxbuf->stx_uid;
1128 buf->st_gid = statxbuf->stx_gid;
1129 buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
1130 buf->st_ino = statxbuf->stx_ino;
1131 buf->st_size = statxbuf->stx_size;
1132 buf->st_blksize = statxbuf->stx_blksize;
1133 buf->st_blocks = statxbuf->stx_blocks;
1134 buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
1135 buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
1136 buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
1137 buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
1138 buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
1139 buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
1140 buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
1141 buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
1142 buf->st_flags = 0;
1143 buf->st_gen = 0;
1144 }
1145
1146
1147 static void uv__iou_fs_statx_post(uv_fs_t* req) {
1148 struct uv__statx* statxbuf;
1149 uv_stat_t* buf;
1150
1151 buf = &req->statbuf;
1152 statxbuf = req->ptr;
1153 req->ptr = NULL;
1154
1155 if (req->result == 0) {
1156 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
1157 uv__statx_to_stat(statxbuf, buf);
1158 req->ptr = buf;
1159 }
1160
1161 uv__free(statxbuf);
1162 }
1163
1164
1165 static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
1166 struct uv__io_uring_cqe* cqe;
1167 struct uv__io_uring_cqe* e;
1168 uv_fs_t* req;
1169 uint32_t head;
1170 uint32_t tail;
1171 uint32_t mask;
1172 uint32_t i;
1173 uint32_t flags;
1174 int nevents;
1175 int rc;
1176
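/* Drain completions between our cached head and the kernel's published tail;
 * the acquire load pairs with the kernel's release store of the tail.
 */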
1177 head = *iou->cqhead;
1178 tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
1179 memory_order_acquire);
1180 mask = iou->cqmask;
1181 cqe = iou->cqe;
1182 nevents = 0;
1183
1184 for (i = head; i != tail; i++) {
1185 e = &cqe[i & mask];
1186
1187 req = (uv_fs_t*) (uintptr_t) e->user_data;
1188 assert(req->type == UV_FS);
1189
1190 uv__req_unregister(loop);
1191 iou->in_flight--;
1192
1193 /* If the op is not supported by the kernel retry using the thread pool */
1194 if (e->res == -EOPNOTSUPP) {
1195 uv__fs_post(loop, req);
1196 continue;
1197 }
1198
1199 /* io_uring stores error codes as negative numbers, same as libuv. */
1200 req->result = e->res;
1201
1202 switch (req->fs_type) {
1203 case UV_FS_FSTAT:
1204 case UV_FS_LSTAT:
1205 case UV_FS_STAT:
1206 uv__iou_fs_statx_post(req);
1207 break;
1208 default: /* Squelch -Wswitch warnings. */
1209 break;
1210 }
1211
1212 uv__metrics_update_idle_time(loop);
1213 req->cb(req);
1214 nevents++;
1215 }
1216
1217 atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
1218 tail,
1219 memory_order_release);
1220
1221 /* Check whether CQEs overflowed; if so, enter the kernel to make them
1222 * available. Don't grab them immediately but in the next loop iteration to
1223 * avoid loop starvation. */
1224 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
1225 memory_order_acquire);
1226
1227 if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
1228 do
1229 rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
1230 while (rc == -1 && errno == EINTR);
1231
1232 if (rc < 0)
1233 perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
1234 }
1235
1236 uv__metrics_inc_events(loop, nevents);
1237 if (uv__get_internal_fields(loop)->current_timeout == 0)
1238 uv__metrics_inc_events_waiting(loop, nevents);
1239 }
1240
1241
1242 /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1243 * executed immediately, otherwise the file descriptor may have been closed
1244 * by the time the kernel starts the operation.
1245 */
1246 static void uv__epoll_ctl_prep(int epollfd,
1247 struct uv__iou* ctl,
1248 struct epoll_event (*events)[256],
1249 int op,
1250 int fd,
1251 struct epoll_event* e) {
1252 struct uv__io_uring_sqe* sqe;
1253 struct epoll_event* pe;
1254 uint32_t mask;
1255 uint32_t slot;
1256
1257 assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
1258 assert(ctl->ringfd != -1);
1259
1260 mask = ctl->sqmask;
1261 slot = (*ctl->sqtail)++ & mask;
1262
1263 pe = &(*events)[slot];
1264 *pe = *e;
1265
1266 sqe = ctl->sqe;
1267 sqe = &sqe[slot];
1268
1269 memset(sqe, 0, sizeof(*sqe));
1270 sqe->addr = (uintptr_t) pe;
1271 sqe->fd = epollfd;
1272 sqe->len = op;
1273 sqe->off = fd;
1274 sqe->opcode = UV__IORING_OP_EPOLL_CTL;
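/* Pack op (2 bits), slot (8 bits) and fd (upper 32 bits) into user_data so
 * uv__epoll_ctl_flush() can recover them from the completion entry.
 */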
1275 sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
1276
1277 if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
1278 uv__epoll_ctl_flush(epollfd, ctl, events);
1279 }
1280
1281
1282 static void uv__epoll_ctl_flush(int epollfd,
1283 struct uv__iou* ctl,
1284 struct epoll_event (*events)[256]) {
1285 struct epoll_event oldevents[256];
1286 struct uv__io_uring_cqe* cqe;
1287 uint32_t oldslot;
1288 uint32_t slot;
1289 uint32_t n;
1290 int fd;
1291 int op;
1292 int rc;
1293
1294 STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
1295 assert(ctl->ringfd != -1);
1296 assert(*ctl->sqhead != *ctl->sqtail);
1297
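/* Submit every pending EPOLL_CTL SQE and wait for the same number of
 * completions so the prep buffer can be reused right away.
 */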
1298 n = *ctl->sqtail - *ctl->sqhead;
1299 do
1300 rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
1301 while (rc == -1 && errno == EINTR);
1302
1303 if (rc < 0)
1304 perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
1305
1306 if (rc != (int) n)
1307 abort();
1308
1309 assert(*ctl->sqhead == *ctl->sqtail);
1310
1311 memcpy(oldevents, *events, sizeof(*events));
1312
1313 /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
1314 * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
1315 * that we are already watching. Ignore the former and retry the latter
1316 * with EPOLL_CTL_MOD.
1317 */
1318 while (*ctl->cqhead != *ctl->cqtail) {
1319 slot = (*ctl->cqhead)++ & ctl->cqmask;
1320
1321 cqe = ctl->cqe;
1322 cqe = &cqe[slot];
1323
1324 if (cqe->res == 0)
1325 continue;
1326
1327 fd = cqe->user_data >> 32;
1328 op = 3 & cqe->user_data;
1329 oldslot = 255 & (cqe->user_data >> 2);
1330
1331 if (op == EPOLL_CTL_DEL)
1332 continue;
1333
1334 if (op != EPOLL_CTL_ADD)
1335 abort();
1336
1337 if (cqe->res != -EEXIST)
1338 abort();
1339
1340 uv__epoll_ctl_prep(epollfd,
1341 ctl,
1342 events,
1343 EPOLL_CTL_MOD,
1344 fd,
1345 &oldevents[oldslot]);
1346 }
1347 }
1348
1349
1350 void uv__io_poll(uv_loop_t* loop, int timeout) {
1351 uv__loop_internal_fields_t* lfields;
1352 struct epoll_event events[1024];
1353 struct epoll_event prep[256];
1354 struct uv__invalidate inv;
1355 struct epoll_event* pe;
1356 struct epoll_event e;
1357 struct uv__iou* ctl;
1358 struct uv__iou* iou;
1359 int real_timeout;
1360 struct uv__queue* q;
1361 uv__io_t* w;
1362 sigset_t* sigmask;
1363 sigset_t sigset;
1364 uint64_t base;
1365 int have_iou_events;
1366 int have_signals;
1367 int nevents;
1368 int epollfd;
1369 int count;
1370 int nfds;
1371 int fd;
1372 int op;
1373 int i;
1374 int user_timeout;
1375 int reset_timeout;
1376
1377 lfields = uv__get_internal_fields(loop);
1378 ctl = &lfields->ctl;
1379 iou = &lfields->iou;
1380
1381 sigmask = NULL;
1382 if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
1383 sigemptyset(&sigset);
1384 sigaddset(&sigset, SIGPROF);
1385 sigmask = &sigset;
1386 }
1387
1388 assert(timeout >= -1);
1389 base = loop->time;
1390 count = 48; /* Benchmarks suggest this gives the best throughput. */
1391 real_timeout = timeout;
1392
1393 if (lfields->flags & UV_METRICS_IDLE_TIME) {
1394 reset_timeout = 1;
1395 user_timeout = timeout;
1396 timeout = 0;
1397 } else {
1398 reset_timeout = 0;
1399 user_timeout = 0;
1400 }
1401
1402 epollfd = loop->backend_fd;
1403
1404 memset(&e, 0, sizeof(e));
1405
1406 while (!uv__queue_empty(&loop->watcher_queue)) {
1407 q = uv__queue_head(&loop->watcher_queue);
1408 w = uv__queue_data(q, uv__io_t, watcher_queue);
1409 uv__queue_remove(q);
1410 uv__queue_init(q);
1411
1412 op = EPOLL_CTL_MOD;
1413 if (w->events == 0)
1414 op = EPOLL_CTL_ADD;
1415
1416 w->events = w->pevents;
1417 e.events = w->pevents;
1418 e.data.fd = w->fd;
1419 fd = w->fd;
1420
1421 if (ctl->ringfd != -1) {
1422 uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
1423 continue;
1424 }
1425
1426 if (!epoll_ctl(epollfd, op, fd, &e))
1427 continue;
1428
1429 assert(op == EPOLL_CTL_ADD);
1430 assert(errno == EEXIST);
1431
1432 /* File descriptor that's been watched before, update event mask. */
1433 if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
1434 abort();
1435 }
1436
1437 inv.events = events;
1438 inv.prep = &prep;
1439 inv.nfds = -1;
1440
1441 for (;;) {
1442 if (loop->nfds == 0)
1443 if (iou->in_flight == 0)
1444 break;
1445
1446 /* All event mask mutations should be visible to the kernel before
1447 * we enter epoll_pwait().
1448 */
1449 if (ctl->ringfd != -1)
1450 while (*ctl->sqhead != *ctl->sqtail)
1451 uv__epoll_ctl_flush(epollfd, ctl, &prep);
1452
1453 /* Only need to set the provider_entry_time if timeout != 0. The function
1454 * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
1455 */
1456 if (timeout != 0)
1457 uv__metrics_set_provider_entry_time(loop);
1458
1459 /* Store the current timeout in a location that's globally accessible so
1460 * other locations like uv__work_done() can determine whether the queue
1461 * of events in the callback were waiting when poll was called.
1462 */
1463 lfields->current_timeout = timeout;
1464
1465 nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
1466
1467 /* Update loop->time unconditionally. It's tempting to skip the update when
1468 * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
1469 * operating system didn't reschedule our process while in the syscall.
1470 */
1471 SAVE_ERRNO(uv__update_time(loop));
1472
1473 if (nfds == -1)
1474 assert(errno == EINTR);
1475 else if (nfds == 0)
1476 /* Unlimited timeout should only return with events or signal. */
1477 assert(timeout != -1);
1478
1479 if (nfds == 0 || nfds == -1) {
1480 if (reset_timeout != 0) {
1481 timeout = user_timeout;
1482 reset_timeout = 0;
1483 } else if (nfds == 0) {
1484 return;
1485 }
1486
1487 /* Interrupted by a signal. Update timeout and poll again. */
1488 goto update_timeout;
1489 }
1490
1491 have_iou_events = 0;
1492 have_signals = 0;
1493 nevents = 0;
1494
1495 inv.nfds = nfds;
1496 lfields->inv = &inv;
1497
1498 for (i = 0; i < nfds; i++) {
1499 pe = events + i;
1500 fd = pe->data.fd;
1501
1502 /* Skip invalidated events, see uv__platform_invalidate_fd */
1503 if (fd == -1)
1504 continue;
1505
1506 if (fd == iou->ringfd) {
1507 uv__poll_io_uring(loop, iou);
1508 have_iou_events = 1;
1509 continue;
1510 }
1511
1512 assert(fd >= 0);
1513 assert((unsigned) fd < loop->nwatchers);
1514
1515 w = loop->watchers[fd];
1516
1517 if (w == NULL) {
1518 /* File descriptor that we've stopped watching, disarm it.
1519 *
1520 * Ignore all errors because we may be racing with another thread
1521 * when the file descriptor is closed.
1522 *
1523 * Perform EPOLL_CTL_DEL immediately instead of going through
1524 * io_uring's submit queue, otherwise the file descriptor may
1525 * be closed by the time the kernel starts the operation.
1526 */
1527 epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
1528 continue;
1529 }
1530
1531 /* Give users only events they're interested in. Prevents spurious
1532 * callbacks when previous callback invocation in this loop has stopped
1533 * the current watcher. Also, filter out events that the user has not
1534 * asked us to watch.
1535 */
1536 pe->events &= w->pevents | POLLERR | POLLHUP;
1537
1538 /* Work around an epoll quirk where it sometimes reports just the
1539 * EPOLLERR or EPOLLHUP event. In order to force the event loop to
1540 * move forward, we merge in the read/write events that the watcher
1541 * is interested in; uv__read() and uv__write() will then deal with
1542 * the error or hangup in the usual fashion.
1543 *
1544 * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
1545 * reads the available data, calls uv_read_stop(), then sometime later
1546 * calls uv_read_start() again. By then, libuv has forgotten about the
1547 * hangup and the kernel won't report EPOLLIN again because there's
1548 * nothing left to read. If anything, libuv is to blame here. The
1549 * current hack is just a quick bandaid; to properly fix it, libuv
1550 * needs to remember the error/hangup event. We should get that for
1551 * free when we switch over to edge-triggered I/O.
1552 */
1553 if (pe->events == POLLERR || pe->events == POLLHUP)
1554 pe->events |=
1555 w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);
1556
1557 if (pe->events != 0) {
1558 /* Run signal watchers last. This also affects child process watchers
1559 * because those are implemented in terms of signal watchers.
1560 */
1561 if (w == &loop->signal_io_watcher) {
1562 have_signals = 1;
1563 } else {
1564 uv__metrics_update_idle_time(loop);
1565 w->cb(loop, w, pe->events);
1566 }
1567
1568 nevents++;
1569 }
1570 }
1571
1572 uv__metrics_inc_events(loop, nevents);
1573 if (reset_timeout != 0) {
1574 timeout = user_timeout;
1575 reset_timeout = 0;
1576 uv__metrics_inc_events_waiting(loop, nevents);
1577 }
1578
1579 if (have_signals != 0) {
1580 uv__metrics_update_idle_time(loop);
1581 loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
1582 }
1583
1584 lfields->inv = NULL;
1585
1586 if (have_iou_events != 0)
1587 break; /* Event loop should cycle now so don't poll again. */
1588
1589 if (have_signals != 0)
1590 break; /* Event loop should cycle now so don't poll again. */
1591
1592 if (nevents != 0) {
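/* A completely full batch hints that more events are pending; poll again
 * without blocking, but cap the number of rounds so the rest of the loop
 * isn't starved.
 */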
1593 if (nfds == ARRAY_SIZE(events) && --count != 0) {
1594 /* Poll for more events but don't block this time. */
1595 timeout = 0;
1596 continue;
1597 }
1598 break;
1599 }
1600
1601 update_timeout:
1602 if (timeout == 0)
1603 break;
1604
1605 if (timeout == -1)
1606 continue;
1607
1608 assert(timeout > 0);
1609
1610 real_timeout -= (loop->time - base);
1611 if (real_timeout <= 0)
1612 break;
1613
1614 timeout = real_timeout;
1615 }
1616
1617 if (ctl->ringfd != -1)
1618 while (*ctl->sqhead != *ctl->sqtail)
1619 uv__epoll_ctl_flush(epollfd, ctl, &prep);
1620 }
1621
1622 uint64_t uv__hrtime(uv_clocktype_t type) {
1623 static _Atomic clock_t fast_clock_id = -1;
1624 struct timespec t;
1625 clock_t clock_id;
1626
1627 /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
1628 * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is
1629 * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
1630 * decide to make a costly system call.
1631 */
1632 /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
1633 * when it has microsecond granularity or better (unlikely).
1634 */
1635 clock_id = CLOCK_MONOTONIC;
1636 if (type != UV_CLOCK_FAST)
1637 goto done;
1638
1639 clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
1640 if (clock_id != -1)
1641 goto done;
1642
1643 clock_id = CLOCK_MONOTONIC;
1644 if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
1645 if (t.tv_nsec <= 1 * 1000 * 1000)
1646 clock_id = CLOCK_MONOTONIC_COARSE;
1647
1648 atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
1649
1650 done:
1651
1652 if (clock_gettime(clock_id, &t))
1653 return 0; /* Not really possible. */
1654
1655 return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
1656 }
1657
1658
1659 int uv_resident_set_memory(size_t* rss) {
1660 char buf[1024];
1661 const char* s;
1662 long val;
1663 int rc;
1664 int i;
1665
1666 /* rss: 24th element */
1667 rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1668 if (rc < 0)
1669 return rc;
1670
1671 /* find the last ')' */
1672 s = strrchr(buf, ')');
1673 if (s == NULL)
1674 goto err;
1675
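/* comm (field 2) may itself contain spaces and parentheses, so start after
 * the last ')' and then skip 22 space-separated fields to land on rss.
 */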
1676 for (i = 1; i <= 22; i++) {
1677 s = strchr(s + 1, ' ');
1678 if (s == NULL)
1679 goto err;
1680 }
1681
1682 errno = 0;
1683 val = strtol(s, NULL, 10);
1684 if (val < 0 || errno != 0)
1685 goto err;
1686
1687 *rss = val * getpagesize();
1688 return 0;
1689
1690 err:
1691 return UV_EINVAL;
1692 }
1693
1694 int uv_uptime(double* uptime) {
1695 struct timespec now;
1696 char buf[128];
1697
1698 /* Consult /proc/uptime when present (common case), or fall back to
1699 * clock_gettime. Why not always clock_gettime? It doesn't always return the
1700 * right result under OpenVZ and possibly other containerized environments.
1701 */
1702 if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1703 if (1 == sscanf(buf, "%lf", uptime))
1704 return 0;
1705
1706 if (clock_gettime(CLOCK_BOOTTIME, &now))
1707 return UV__ERR(errno);
1708
1709 *uptime = now.tv_sec;
1710 return 0;
1711 }
1712
1713
1714 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
1715 #if defined(__PPC__)
1716 static const char model_marker[] = "cpu\t\t: ";
1717 static const char model_marker2[] = "";
1718 #elif defined(__arm__)
1719 static const char model_marker[] = "model name\t: ";
1720 static const char model_marker2[] = "Processor\t: ";
1721 #elif defined(__aarch64__)
1722 static const char model_marker[] = "CPU part\t: ";
1723 static const char model_marker2[] = "";
1724 #elif defined(__mips__)
1725 static const char model_marker[] = "cpu model\t\t: ";
1726 static const char model_marker2[] = "";
1727 #elif defined(__loongarch__)
1728 static const char model_marker[] = "cpu family\t\t: ";
1729 static const char model_marker2[] = "";
1730 #else
1731 static const char model_marker[] = "model name\t: ";
1732 static const char model_marker2[] = "";
1733 #endif
1734 static const char parts[] =
1735 #ifdef __aarch64__
1736 "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n"
1737 "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n"
1738 "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n"
1739 "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
1740 "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n"
1741 "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n"
1742 "0xc0d\nCortex-A17\n" /* Originally A12 */
1743 "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n"
1744 "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n"
1745 "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n"
1746 "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n"
1747 "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n"
1748 "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n"
1749 "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n"
1750 "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n"
1751 "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n"
1752 "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n"
1753 "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n"
1754 #endif
1755 "";
1756 struct cpu {
1757 unsigned long long freq, user, nice, sys, idle, irq;
1758 unsigned model;
1759 };
1760 FILE* fp;
1761 char* p;
1762 int found;
1763 int n;
1764 unsigned i;
1765 unsigned cpu;
1766 unsigned maxcpu;
1767 unsigned size;
1768 unsigned long long skip;
1769 struct cpu (*cpus)[8192]; /* Kernel maximum. */
1770 struct cpu* c;
1771 struct cpu t;
1772 char (*model)[64];
1773 unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
1774 /* Assumption: even big.LITTLE systems will have only a handful
1775 * of different CPU models. Most systems will just have one.
1776 */
1777 char models[8][64];
1778 char buf[1024];
1779
1780 memset(bitmap, 0, sizeof(bitmap));
1781 memset(models, 0, sizeof(models));
1782 snprintf(*models, sizeof(*models), "unknown");
1783 maxcpu = 0;
1784
1785 cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
1786 if (cpus == NULL)
1787 return UV_ENOMEM;
1788
1789 fp = uv__open_file("/proc/stat");
1790 if (fp == NULL) {
1791 uv__free(cpus);
1792 return UV__ERR(errno);
1793 }
1794
1795 if (NULL == fgets(buf, sizeof(buf), fp))
1796 abort();
1797
1798 for (;;) {
1799 memset(&t, 0, sizeof(t));
1800
1801 n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
1802 &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
1803
1804 if (n != 7)
1805 break;
1806
1807 if (NULL == fgets(buf, sizeof(buf), fp))
1808 abort();
1809
1810 if (cpu >= ARRAY_SIZE(*cpus))
1811 continue;
1812
1813 (*cpus)[cpu] = t;
1814
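/* Record which CPU numbers were actually present; /proc/stat can have gaps,
 * e.g. when CPUs are offline.
 */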
1815 bitmap[cpu >> 3] |= 1 << (cpu & 7);
1816
1817 if (cpu >= maxcpu)
1818 maxcpu = cpu + 1;
1819 }
1820
1821 fclose(fp);
1822
1823 fp = uv__open_file("/proc/cpuinfo");
1824 if (fp == NULL)
1825 goto nocpuinfo;
1826
1827 for (;;) {
1828 if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
1829 break; /* Parse error. */
1830
1831 while (fgets(buf, sizeof(buf), fp)) {
1832 if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) {
1833 p = buf + sizeof(model_marker) - 1;
1834 goto parts;
1835 }
1836 if (!*model_marker2)
1837 continue;
1838 if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) {
1839 p = buf + sizeof(model_marker2) - 1;
1840 goto parts;
1841 }
1842 }
1843
1844 goto next; /* Not found. */
1845
1846 parts:
1847 n = (int) strcspn(p, "\n");
1848
1849 /* arm64: translate CPU part code to model name. */
1850 if (*parts) {
1851 p = memmem(parts, sizeof(parts) - 1, p, n + 1);
1852 if (p == NULL)
1853 p = "unknown";
1854 else
1855 p += n + 1;
1856 n = (int) strcspn(p, "\n");
1857 }
1858
1859 found = 0;
1860 for (model = models; !found && model < ARRAY_END(models); model++)
1861 found = !strncmp(p, *model, strlen(*model));
1862
1863 if (!found)
1864 goto next;
1865
1866 if (**model == '\0')
1867 snprintf(*model, sizeof(*model), "%.*s", n, p);
1868
1869 if (cpu < maxcpu)
1870 (*cpus)[cpu].model = model - models;
1871
1872 next:
1873 while (fgets(buf, sizeof(buf), fp))
1874 if (*buf == '\n')
1875 break;
1876 }
1877
1878 fclose(fp);
1879 fp = NULL;
1880
1881 nocpuinfo:
1882
1883 n = 0;
1884 for (cpu = 0; cpu < maxcpu; cpu++) {
1885 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1886 continue;
1887
1888 n++;
1889 snprintf(buf, sizeof(buf),
1890 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
1891
1892 fp = uv__open_file(buf);
1893 if (fp == NULL)
1894 continue;
1895
1896 if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
1897 abort();
1898 fclose(fp);
1899 fp = NULL;
1900 }
1901
1902 size = n * sizeof(**ci) + sizeof(models);
1903 *ci = uv__malloc(size);
1904 *count = 0;
1905
1906 if (*ci == NULL) {
1907 uv__free(cpus);
1908 return UV_ENOMEM;
1909 }
1910
1911 *count = n;
1912 p = memcpy(*ci + n, models, sizeof(models));
1913
1914 i = 0;
1915 for (cpu = 0; cpu < maxcpu; cpu++) {
1916 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1917 continue;
1918
1919 c = *cpus + cpu;
1920
1921 (*ci)[i++] = (uv_cpu_info_t) {
1922 .model = p + c->model * sizeof(*model),
1923 .speed = c->freq / 1000,
1924 /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
1925 * therefore the multiplier is always 1000/100 = 10.
1926 */
1927 .cpu_times = (struct uv_cpu_times_s) {
1928 .user = 10 * c->user,
1929 .nice = 10 * c->nice,
1930 .sys = 10 * c->sys,
1931 .idle = 10 * c->idle,
1932 .irq = 10 * c->irq,
1933 },
1934 };
1935 }
1936
1937 uv__free(cpus);
1938
1939 return 0;
1940 }
1941
1942
1943 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
1944 if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
1945 return 1;
1946 if (ent->ifa_addr == NULL)
1947 return 1;
1948 /*
1949 * On Linux getifaddrs returns information related to the raw underlying
1950 * devices. We're not interested in this information yet.
1951 */
1952 if (ent->ifa_addr->sa_family == PF_PACKET)
1953 return exclude_type;
1954 return !exclude_type;
1955 }
1956
1957 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
1958 struct ifaddrs *addrs, *ent;
1959 uv_interface_address_t* address;
1960 int i;
1961 struct sockaddr_ll *sll;
1962
1963 *count = 0;
1964 *addresses = NULL;
1965
1966 if (getifaddrs(&addrs))
1967 return UV__ERR(errno);
1968
1969 /* Count the number of interfaces */
1970 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1971 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1972 continue;
1973
1974 (*count)++;
1975 }
1976
1977 if (*count == 0) {
1978 freeifaddrs(addrs);
1979 return 0;
1980 }
1981
1982   /* Make sure the memory is initialized to zero using calloc() */
1983 *addresses = uv__calloc(*count, sizeof(**addresses));
1984 if (!(*addresses)) {
1985 freeifaddrs(addrs);
1986 return UV_ENOMEM;
1987 }
1988
1989 address = *addresses;
1990
1991 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1992 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1993 continue;
1994
1995 address->name = uv__strdup(ent->ifa_name);
1996
1997 if (ent->ifa_addr->sa_family == AF_INET6) {
1998 address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
1999 } else {
2000 address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
2001 }
2002
2003 if (ent->ifa_netmask->sa_family == AF_INET6) {
2004 address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
2005 } else {
2006 address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
2007 }
2008
2009 address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
2010
2011 address++;
2012 }
2013
2014 /* Fill in physical addresses for each interface */
2015 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
2016 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
2017 continue;
2018
2019 address = *addresses;
2020
2021 for (i = 0; i < (*count); i++) {
2022 size_t namelen = strlen(ent->ifa_name);
2023       /* Alias interfaces share the same physical address */
2024 if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
2025 (address->name[namelen] == 0 || address->name[namelen] == ':')) {
2026 sll = (struct sockaddr_ll*)ent->ifa_addr;
2027 memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
2028 }
2029 address++;
2030 }
2031 }
2032
2033 freeifaddrs(addrs);
2034
2035 return 0;
2036 }
2037
2038
2039 void uv_free_interface_addresses(uv_interface_address_t* addresses,
2040 int count) {
2041 int i;
2042
2043 for (i = 0; i < count; i++) {
2044 uv__free(addresses[i].name);
2045 }
2046
2047 uv__free(addresses);
2048 }
2049
2050
2051 void uv__set_process_title(const char* title) {
2052 #if defined(PR_SET_NAME)
2053 prctl(PR_SET_NAME, title); /* Only copies first 16 characters. */
2054 #endif
2055 }
2056
2057
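/* Parse a "<what> <N> kB" line out of /proc/meminfo and return the value
 * converted to bytes, or 0 when the file or the requested field cannot be
 * read.
 */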
2058 static uint64_t uv__read_proc_meminfo(const char* what) {
2059 uint64_t rc;
2060 char* p;
2061 char buf[4096]; /* Large enough to hold all of /proc/meminfo. */
2062
2063 if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
2064 return 0;
2065
2066 p = strstr(buf, what);
2067
2068 if (p == NULL)
2069 return 0;
2070
2071 p += strlen(what);
2072
2073 rc = 0;
2074 sscanf(p, "%" PRIu64 " kB", &rc);
2075
2076 return rc * 1024;
2077 }
2078
2079
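/* Prefer MemAvailable from /proc/meminfo, which also counts memory the kernel
 * can reclaim (e.g. page cache); fall back to sysinfo()'s freeram, which only
 * counts strictly free pages.
 */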
2080 uint64_t uv_get_free_memory(void) {
2081 struct sysinfo info;
2082 uint64_t rc;
2083
2084 rc = uv__read_proc_meminfo("MemAvailable:");
2085
2086 if (rc != 0)
2087 return rc;
2088
2089 if (0 == sysinfo(&info))
2090 return (uint64_t) info.freeram * info.mem_unit;
2091
2092 return 0;
2093 }
2094
2095
2096 uint64_t uv_get_total_memory(void) {
2097 struct sysinfo info;
2098 uint64_t rc;
2099
2100 rc = uv__read_proc_meminfo("MemTotal:");
2101
2102 if (rc != 0)
2103 return rc;
2104
2105 if (0 == sysinfo(&info))
2106 return (uint64_t) info.totalram * info.mem_unit;
2107
2108 return 0;
2109 }
2110
2111
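/* Read a single unsigned integer from a procfs/sysfs file.  cgroup2 limit
 * files may contain the literal string "max", which is mapped to UINT64_MAX.
 * Returns 0 on any error, so callers cannot tell a missing file apart from a
 * genuine value of zero.
 */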
2112 static uint64_t uv__read_uint64(const char* filename) {
2113 char buf[32]; /* Large enough to hold an encoded uint64_t. */
2114 uint64_t rc;
2115
2116 rc = 0;
2117 if (0 == uv__slurp(filename, buf, sizeof(buf)))
2118 if (1 != sscanf(buf, "%" PRIu64, &rc))
2119 if (0 == strcmp(buf, "max\n"))
2120 rc = UINT64_MAX;
2121
2122 return rc;
2123 }
2124
2125
2126 /* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
2127 * finds the location and length of the memory controller mount path.
2128 * This disregards the leading / for easy concatenation of paths.
2129 * Returns NULL if the memory controller wasn't found. */
2130 static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
2131 int* n) {
2132 char* p;
2133
2134 /* Seek to the memory controller line. */
2135 p = strchr(buf, ':');
2136 while (p != NULL && strncmp(p, ":memory:", 8)) {
2137 p = strchr(p, '\n');
2138 if (p != NULL)
2139 p = strchr(p, ':');
2140 }
2141
2142 if (p != NULL) {
2143 /* Determine the length of the mount path. */
2144 p = p + strlen(":memory:/");
2145 *n = (int) strcspn(p, "\n");
2146 }
2147
2148 return p;
2149 }
2150
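/* Resolve the soft ("high") and hard ("max") memory limits from the cgroup v1
 * memory controller: memory.soft_limit_in_bytes and memory.limit_in_bytes are
 * read relative to the controller's mount path, with the top-level controller
 * as a fallback.  cgroup1 reports "no limit" as a page-aligned LONG_MAX, which
 * is normalized to UINT64_MAX so both cgroup versions agree.
 */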
2151 static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
2152 uint64_t* max) {
2153 char filename[4097];
2154 char* p;
2155 int n;
2156 uint64_t cgroup1_max;
2157
2158 /* Find out where the controller is mounted. */
2159 p = uv__cgroup1_find_memory_controller(buf, &n);
2160 if (p != NULL) {
2161 snprintf(filename, sizeof(filename),
2162 "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
2163 *high = uv__read_uint64(filename);
2164
2165 snprintf(filename, sizeof(filename),
2166 "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
2167 *max = uv__read_uint64(filename);
2168
2169 /* If the controller wasn't mounted, the reads above will have failed,
2170 * as indicated by uv__read_uint64 returning 0.
2171 */
2172 if (*high != 0 && *max != 0)
2173 goto update_limits;
2174 }
2175
2176 /* Fall back to the limits of the global memory controller. */
2177 *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
2178 *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
2179
2180 /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
2181 * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
2182 */
2183 update_limits:
2184 cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
2185 if (*high == cgroup1_max)
2186 *high = UINT64_MAX;
2187 if (*max == cgroup1_max)
2188 *max = UINT64_MAX;
2189 }
2190
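/* Resolve memory.high and memory.max for the cgroup v2 path named by the
 * "0::/..." line of /proc/self/cgroup.  uv__read_uint64() already maps the
 * literal "max" to UINT64_MAX, so no further normalization is needed here.
 */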
2191 static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
2192 uint64_t* max) {
2193 char filename[4097];
2194 char* p;
2195 int n;
2196
2197 /* Find out where the controller is mounted. */
2198 p = buf + strlen("0::/");
2199 n = (int) strcspn(p, "\n");
2200
2201 /* Read the memory limits of the controller. */
2202 snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
2203 *max = uv__read_uint64(filename);
2204 snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
2205 *high = uv__read_uint64(filename);
2206 }
2207
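/* Effective cgroup memory limit: the smaller of the high/soft and max/hard
 * limits, or 0 when either of them could not be determined.
 */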
2208 static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
2209 uint64_t high;
2210 uint64_t max;
2211
2212 /* In the case of cgroupv2, we'll only have a single entry. */
2213 if (strncmp(buf, "0::/", 4))
2214 uv__get_cgroup1_memory_limits(buf, &high, &max);
2215 else
2216 uv__get_cgroup2_memory_limits(buf, &high, &max);
2217
2218 if (high == 0 || max == 0)
2219 return 0;
2220
2221 return high < max ? high : max;
2222 }
2223
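/* Public API: the memory limit imposed by the process's cgroup, or 0 when no
 * limit could be determined.  A minimal usage sketch (illustrative only, not
 * part of this file):
 *
 *   uint64_t limit = uv_get_constrained_memory();
 *   if (limit == 0)
 *     limit = uv_get_total_memory();
 */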
2224 uint64_t uv_get_constrained_memory(void) {
2225 char buf[1024];
2226
2227 if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2228 return 0;
2229
2230 return uv__get_cgroup_constrained_memory(buf);
2231 }
2232
2233
2234 static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
2235 char filename[4097];
2236 uint64_t current;
2237 char* p;
2238 int n;
2239
2240 /* Find out where the controller is mounted. */
2241 p = uv__cgroup1_find_memory_controller(buf, &n);
2242 if (p != NULL) {
2243 snprintf(filename, sizeof(filename),
2244 "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
2245 current = uv__read_uint64(filename);
2246
2247     /* If the controller wasn't mounted, the read above will have failed,
2248 * as indicated by uv__read_uint64 returning 0.
2249 */
2250 if (current != 0)
2251 return current;
2252 }
2253
2254 /* Fall back to the usage of the global memory controller. */
2255 return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
2256 }
2257
2258 static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
2259 char filename[4097];
2260 char* p;
2261 int n;
2262
2263 /* Find out where the controller is mounted. */
2264 p = buf + strlen("0::/");
2265 n = (int) strcspn(p, "\n");
2266
2267 snprintf(filename, sizeof(filename),
2268 "/sys/fs/cgroup/%.*s/memory.current", n, p);
2269 return uv__read_uint64(filename);
2270 }
2271
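/* Public API: memory still available to this process.  Uses the cgroup limit
 * minus the cgroup's current usage when a usable limit exists and does not
 * exceed physical memory; otherwise falls back to uv_get_free_memory().
 * Returns 0 when usage temporarily exceeds the limit.
 */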
2272 uint64_t uv_get_available_memory(void) {
2273 char buf[1024];
2274 uint64_t constrained;
2275 uint64_t current;
2276 uint64_t total;
2277
2278 if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2279 return 0;
2280
2281 constrained = uv__get_cgroup_constrained_memory(buf);
2282 if (constrained == 0)
2283 return uv_get_free_memory();
2284
2285 total = uv_get_total_memory();
2286 if (constrained > total)
2287 return uv_get_free_memory();
2288
2289 /* In the case of cgroupv2, we'll only have a single entry. */
2290 if (strncmp(buf, "0::/", 4))
2291 current = uv__get_cgroup1_current_memory(buf);
2292 else
2293 current = uv__get_cgroup2_current_memory(buf);
2294
2295 /* memory usage can be higher than the limit (for short bursts of time) */
2296 if (constrained < current)
2297 return 0;
2298
2299 return constrained - current;
2300 }
2301
2302
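/* Parse the cgroup v2 CPU controller for this process's cgroup.  cpu.max
 * holds "<quota> <period>", where <quota> may be the literal "max"
 * (unlimited, mapped to LLONG_MAX); cpu.weight holds an integer that is
 * converted to a proportion relative to the default weight of 100.
 */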
2303 static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
2304 uv__cpu_constraint* constraint) {
2305 char path[256];
2306 char buf[1024];
2307 unsigned int weight;
2308 int cgroup_size;
2309 const char* cgroup_trimmed;
2310 char quota_buf[16];
2311
2312 if (strncmp(cgroup, "0::/", 4) != 0)
2313 return UV_EINVAL;
2314
2315   /* Isolate the cgroup path that follows the "0::/" prefix. */
2316   cgroup_trimmed = cgroup + sizeof("0::/") - 1; /* Skip the prefix "0::/" */
2317   cgroup_size = (int)strcspn(cgroup_trimmed, "\n"); /* Length up to the newline */
2318
2319 /* Construct the path to the cpu.max file */
2320 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.max", cgroup_size,
2321 cgroup_trimmed);
2322
2323 /* Read cpu.max */
2324 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2325 return UV_EIO;
2326
2327 if (sscanf(buf, "%15s %llu", quota_buf, &constraint->period_length) != 2)
2328 return UV_EINVAL;
2329
2330 if (strncmp(quota_buf, "max", 3) == 0)
2331 constraint->quota_per_period = LLONG_MAX;
2332 else if (sscanf(quota_buf, "%lld", &constraint->quota_per_period) != 1)
2333     return UV_EINVAL; /* conversion failed */
2334
2335 /* Construct the path to the cpu.weight file */
2336 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.weight", cgroup_size,
2337 cgroup_trimmed);
2338
2339 /* Read cpu.weight */
2340 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2341 return UV_EIO;
2342
2343 if (sscanf(buf, "%u", &weight) != 1)
2344 return UV_EINVAL;
2345
2346 constraint->proportions = (double)weight / 100.0;
2347
2348 return 0;
2349 }
2350
2351 static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
2352 int* cgroup_size) {
2353 /* Seek to the cpu controller line. */
2354 char* cgroup_cpu = strstr(cgroup, ":cpu,");
2355
2356 if (cgroup_cpu != NULL) {
2357 /* Skip the controller prefix to the start of the cgroup path. */
2358 cgroup_cpu += sizeof(":cpu,") - 1;
2359 /* Determine the length of the cgroup path, excluding the newline. */
2360 *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
2361 }
2362
2363 return cgroup_cpu;
2364 }
2365
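/* Parse the cgroup v1 CPU controller: cpu.cfs_quota_us and cpu.cfs_period_us
 * provide the bandwidth limit, and cpu.shares is converted to a proportion
 * relative to the default of 1024 shares.
 */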
2366 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
2367 uv__cpu_constraint* constraint) {
2368 char path[256];
2369 char buf[1024];
2370 unsigned int shares;
2371 int cgroup_size;
2372 char* cgroup_cpu;
2373
2374 cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
2375
2376 if (cgroup_cpu == NULL)
2377 return UV_EIO;
2378
2379 /* Construct the path to the cpu.cfs_quota_us file */
2380 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
2381 cgroup_size, cgroup_cpu);
2382
2383 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2384 return UV_EIO;
2385
2386 if (sscanf(buf, "%lld", &constraint->quota_per_period) != 1)
2387 return UV_EINVAL;
2388
2389 /* Construct the path to the cpu.cfs_period_us file */
2390 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
2391 cgroup_size, cgroup_cpu);
2392
2393 /* Read cpu.cfs_period_us */
2394 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2395 return UV_EIO;
2396
2397 if (sscanf(buf, "%lld", &constraint->period_length) != 1)
2398 return UV_EINVAL;
2399
2400 /* Construct the path to the cpu.shares file */
2401 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.shares", cgroup_size,
2402 cgroup_cpu);
2403
2404 /* Read cpu.shares */
2405 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2406 return UV_EIO;
2407
2408 if (sscanf(buf, "%u", &shares) != 1)
2409 return UV_EINVAL;
2410
2411 constraint->proportions = (double)shares / 1024.0;
2412
2413 return 0;
2414 }
2415
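/* Fill in the CPU constraint for the calling process: quota_per_period and
 * period_length come from the cgroup bandwidth settings (quota_per_period is
 * LLONG_MAX when unlimited), and proportions is the relative CPU weight,
 * scaled so that the usual defaults (cpu.weight 100 on v2, cpu.shares 1024 on
 * v1) map to 1.0.
 */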
2416 int uv__get_constrained_cpu(uv__cpu_constraint* constraint) {
2417 char cgroup[1024];
2418
2419 /* Read the cgroup from /proc/self/cgroup */
2420 if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
2421 return UV_EIO;
2422
2423 /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
2424 * The entry for cgroup v2 is always in the format "0::$PATH"
2425 * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
2426 if (strncmp(cgroup, "0::/", 4) == 0)
2427 return uv__get_cgroupv2_constrained_cpu(cgroup, constraint);
2428 else
2429 return uv__get_cgroupv1_constrained_cpu(cgroup, constraint);
2430 }
2431
2432
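/* 1-, 5- and 15-minute load averages.  Prefer /proc/loadavg; fall back to
 * sysinfo(), whose fixed-point load values use a scale factor of 1 << 16,
 * hence the division by 65536.
 */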
2433 void uv_loadavg(double avg[3]) {
2434 struct sysinfo info;
2435 char buf[128]; /* Large enough to hold all of /proc/loadavg. */
2436
2437 if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
2438 if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
2439 return;
2440
2441 if (sysinfo(&info) < 0)
2442 return;
2443
2444 avg[0] = (double) info.loads[0] / 65536.0;
2445 avg[1] = (double) info.loads[1] / 65536.0;
2446 avg[2] = (double) info.loads[2] / 65536.0;
2447 }
2448
2449
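/* Ordering function for the red-black tree of watcher lists; entries are
 * keyed by their inotify watch descriptor (see RB_FIND/RB_INSERT below).
 */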
2450 static int compare_watchers(const struct watcher_list* a,
2451 const struct watcher_list* b) {
2452 if (a->wd < b->wd) return -1;
2453 if (a->wd > b->wd) return 1;
2454 return 0;
2455 }
2456
2457
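/* Lazily create the per-loop inotify descriptor (non-blocking, close-on-exec)
 * and start polling it for readability.  Calling this again once
 * loop->inotify_fd is set is a no-op.
 */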
2458 static int init_inotify(uv_loop_t* loop) {
2459 int fd;
2460
2461 if (loop->inotify_fd != -1)
2462 return 0;
2463
2464 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
2465 if (fd < 0)
2466 return UV__ERR(errno);
2467
2468 loop->inotify_fd = fd;
2469 uv__io_init(&loop->inotify_read_watcher, uv__inotify_read, loop->inotify_fd);
2470 uv__io_start(loop, &loop->inotify_read_watcher, POLLIN);
2471
2472 return 0;
2473 }
2474
2475
2476 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
2477 /* Open the inotify_fd, and re-arm all the inotify watchers. */
2478 int err;
2479 struct watcher_list* tmp_watcher_list_iter;
2480 struct watcher_list* watcher_list;
2481 struct watcher_list tmp_watcher_list;
2482 struct uv__queue queue;
2483 struct uv__queue* q;
2484 uv_fs_event_t* handle;
2485 char* tmp_path;
2486
2487 if (root == NULL)
2488 return 0;
2489
2490 /* We must restore the old watcher list to be able to close items
2491 * out of it.
2492 */
2493 loop->inotify_watchers = root;
2494
2495 uv__queue_init(&tmp_watcher_list.watchers);
2496   /* Note that the queue we use is shared with the uv_fs_event_start() and
2497    * uv_fs_event_stop() functions, making uv__queue_foreach unsafe to use.
2498    * So we use the uv__queue_move trick to safely iterate. Also don't free
2499    * the watcher list until we're done iterating; cf. uv__inotify_read.
2500 */
2501 RB_FOREACH_SAFE(watcher_list, watcher_root,
2502 uv__inotify_watchers(loop), tmp_watcher_list_iter) {
2503 watcher_list->iterating = 1;
2504 uv__queue_move(&watcher_list->watchers, &queue);
2505 while (!uv__queue_empty(&queue)) {
2506 q = uv__queue_head(&queue);
2507 handle = uv__queue_data(q, uv_fs_event_t, watchers);
2508 /* It's critical to keep a copy of path here, because it
2509 * will be set to NULL by stop() and then deallocated by
2510 * maybe_free_watcher_list
2511 */
2512 tmp_path = uv__strdup(handle->path);
2513 assert(tmp_path != NULL);
2514 uv__queue_remove(q);
2515 uv__queue_insert_tail(&watcher_list->watchers, q);
2516 uv_fs_event_stop(handle);
2517
2518 uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
2519 handle->path = tmp_path;
2520 }
2521 watcher_list->iterating = 0;
2522 maybe_free_watcher_list(watcher_list, loop);
2523 }
2524
2525 uv__queue_move(&tmp_watcher_list.watchers, &queue);
2526 while (!uv__queue_empty(&queue)) {
2527 q = uv__queue_head(&queue);
2528 uv__queue_remove(q);
2529 handle = uv__queue_data(q, uv_fs_event_t, watchers);
2530 tmp_path = handle->path;
2531 handle->path = NULL;
2532 err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
2533 uv__free(tmp_path);
2534 if (err)
2535 return err;
2536 }
2537
2538 return 0;
2539 }
2540
2541
2542 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
2543 struct watcher_list w;
2544 w.wd = wd;
2545 return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
2546 }
2547
2548
2549 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
2550   /* If watcher_list->watchers is being iterated over, we can't free it. */
2551 if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
2552 /* No watchers left for this path. Clean up. */
2553 RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
2554 inotify_rm_watch(loop->inotify_fd, w->wd);
2555 uv__free(w);
2556 }
2557 }
2558
2559
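/* Drain the inotify descriptor and dispatch events to the watchers.  Each
 * read() returns one or more variable-length struct inotify_event records;
 * IN_ATTRIB and IN_MODIFY map to UV_CHANGE, everything else to UV_RENAME.
 */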
2560 static void uv__inotify_read(uv_loop_t* loop,
2561 uv__io_t* dummy,
2562 unsigned int events) {
2563 const struct inotify_event* e;
2564 struct watcher_list* w;
2565 uv_fs_event_t* h;
2566 struct uv__queue queue;
2567 struct uv__queue* q;
2568 const char* path;
2569 ssize_t size;
2570 const char *p;
2571 /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
2572 char buf[4096];
2573
2574 for (;;) {
2575 do
2576 size = read(loop->inotify_fd, buf, sizeof(buf));
2577 while (size == -1 && errno == EINTR);
2578
2579 if (size == -1) {
2580 assert(errno == EAGAIN || errno == EWOULDBLOCK);
2581 break;
2582 }
2583
2584 assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */
2585
2586 /* Now we have one or more inotify_event structs. */
2587 for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
2588 e = (const struct inotify_event*) p;
2589
2590 events = 0;
2591 if (e->mask & (IN_ATTRIB|IN_MODIFY))
2592 events |= UV_CHANGE;
2593 if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
2594 events |= UV_RENAME;
2595
2596 w = find_watcher(loop, e->wd);
2597 if (w == NULL)
2598 continue; /* Stale event, no watchers left. */
2599
2600 /* inotify does not return the filename when monitoring a single file
2601 * for modifications. Repurpose the filename for API compatibility.
2602 * I'm not convinced this is a good thing, maybe it should go.
2603 */
2604 path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);
2605
2606 /* We're about to iterate over the queue and call user's callbacks.
2607 * What can go wrong?
2608 * A callback could call uv_fs_event_stop()
2609 * and the queue can change under our feet.
2610 * So, we use uv__queue_move() trick to safely iterate over the queue.
2611 * And we don't free the watcher_list until we're done iterating.
2612 *
2613 * First,
2614 * tell uv_fs_event_stop() (that could be called from a user's callback)
2615 * not to free watcher_list.
2616 */
2617 w->iterating = 1;
2618 uv__queue_move(&w->watchers, &queue);
2619 while (!uv__queue_empty(&queue)) {
2620 q = uv__queue_head(&queue);
2621 h = uv__queue_data(q, uv_fs_event_t, watchers);
2622
2623 uv__queue_remove(q);
2624 uv__queue_insert_tail(&w->watchers, q);
2625
2626 h->cb(h, path, events, 0);
2627 }
2628 /* done iterating, time to (maybe) free empty watcher_list */
2629 w->iterating = 0;
2630 maybe_free_watcher_list(w, loop);
2631 }
2632 }
2633 }
2634
2635
2636 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
2637 uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
2638 return 0;
2639 }
2640
2641
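/* Start watching `path`.  Watches are de-duplicated per loop: when an inotify
 * watch descriptor already exists for the path, the handle is appended to the
 * existing watcher_list instead of adding a new watch.  A minimal usage
 * sketch (illustrative only; on_change is an application-defined
 * uv_fs_event_cb):
 *
 *   uv_fs_event_t handle;
 *   uv_fs_event_init(loop, &handle);
 *   uv_fs_event_start(&handle, on_change, "/some/path", 0);
 */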
2642 int uv_fs_event_start(uv_fs_event_t* handle,
2643 uv_fs_event_cb cb,
2644 const char* path,
2645 unsigned int flags) {
2646 struct watcher_list* w;
2647 uv_loop_t* loop;
2648 size_t len;
2649 int events;
2650 int err;
2651 int wd;
2652
2653 if (uv__is_active(handle))
2654 return UV_EINVAL;
2655
2656 loop = handle->loop;
2657
2658 err = init_inotify(loop);
2659 if (err)
2660 return err;
2661
2662 events = IN_ATTRIB
2663 | IN_CREATE
2664 | IN_MODIFY
2665 | IN_DELETE
2666 | IN_DELETE_SELF
2667 | IN_MOVE_SELF
2668 | IN_MOVED_FROM
2669 | IN_MOVED_TO;
2670
2671 wd = inotify_add_watch(loop->inotify_fd, path, events);
2672 if (wd == -1)
2673 return UV__ERR(errno);
2674
2675 w = find_watcher(loop, wd);
2676 if (w)
2677 goto no_insert;
2678
2679 len = strlen(path) + 1;
2680 w = uv__malloc(sizeof(*w) + len);
2681 if (w == NULL)
2682 return UV_ENOMEM;
2683
2684 w->wd = wd;
2685 w->path = memcpy(w + 1, path, len);
2686 uv__queue_init(&w->watchers);
2687 w->iterating = 0;
2688 RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
2689
2690 no_insert:
2691 uv__handle_start(handle);
2692 uv__queue_insert_tail(&w->watchers, &handle->watchers);
2693 handle->path = w->path;
2694 handle->cb = cb;
2695 handle->wd = wd;
2696
2697 return 0;
2698 }
2699
2700
2701 int uv_fs_event_stop(uv_fs_event_t* handle) {
2702 struct watcher_list* w;
2703
2704 if (!uv__is_active(handle))
2705 return 0;
2706
2707 w = find_watcher(handle->loop, handle->wd);
2708 assert(w != NULL);
2709
2710 handle->wd = -1;
2711 handle->path = NULL;
2712 uv__handle_stop(handle);
2713 uv__queue_remove(&handle->watchers);
2714
2715 maybe_free_watcher_list(w, handle->loop);
2716
2717 return 0;
2718 }
2719
2720
2721 void uv__fs_event_close(uv_fs_event_t* handle) {
2722 uv_fs_event_stop(handle);
2723 }
2724