xref: /libuv/src/unix/linux.c (revision e78e29c2)
1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2  * Permission is hereby granted, free of charge, to any person obtaining a copy
3  * of this software and associated documentation files (the "Software"), to
4  * deal in the Software without restriction, including without limitation the
5  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6  * sell copies of the Software, and to permit persons to whom the Software is
7  * furnished to do so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be included in
10  * all copies or substantial portions of the Software.
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18  * IN THE SOFTWARE.
19  */
20 
21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22  * EPOLL* counterparts.  We use the POLL* variants in this file because that
23  * is what libuv uses elsewhere.
24  */
25 
26 #include "uv.h"
27 #include "internal.h"
28 
29 #include <inttypes.h>
30 #include <stdatomic.h>
31 #include <stddef.h>  /* offsetof */
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <errno.h>
38 
39 #include <fcntl.h>
40 #include <ifaddrs.h>
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <netpacket/packet.h>
44 #include <sys/epoll.h>
45 #include <sys/inotify.h>
46 #include <sys/mman.h>
47 #include <sys/param.h>
48 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/stat.h>
51 #include <sys/syscall.h>
52 #include <sys/sysinfo.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/utsname.h>
56 #include <time.h>
57 #include <unistd.h>
58 
59 #ifndef __NR_io_uring_setup
60 # define __NR_io_uring_setup 425
61 #endif
62 
63 #ifndef __NR_io_uring_enter
64 # define __NR_io_uring_enter 426
65 #endif
66 
67 #ifndef __NR_io_uring_register
68 # define __NR_io_uring_register 427
69 #endif
70 
71 #ifndef __NR_copy_file_range
72 # if defined(__x86_64__)
73 #  define __NR_copy_file_range 326
74 # elif defined(__i386__)
75 #  define __NR_copy_file_range 377
76 # elif defined(__s390__)
77 #  define __NR_copy_file_range 375
78 # elif defined(__arm__)
79 #  define __NR_copy_file_range 391
80 # elif defined(__aarch64__)
81 #  define __NR_copy_file_range 285
82 # elif defined(__powerpc__)
83 #  define __NR_copy_file_range 379
84 # elif defined(__arc__)
85 #  define __NR_copy_file_range 285
86 # elif defined(__riscv)
87 #  define __NR_copy_file_range 285
88 # endif
89 #endif /* __NR_copy_file_range */
90 
91 #ifndef __NR_statx
92 # if defined(__x86_64__)
93 #  define __NR_statx 332
94 # elif defined(__i386__)
95 #  define __NR_statx 383
96 # elif defined(__aarch64__)
97 #  define __NR_statx 397
98 # elif defined(__arm__)
99 #  define __NR_statx 397
100 # elif defined(__ppc__)
101 #  define __NR_statx 383
102 # elif defined(__s390__)
103 #  define __NR_statx 379
104 # elif defined(__riscv)
105 #  define __NR_statx 291
106 # endif
107 #endif /* __NR_statx */
108 
109 #ifndef __NR_getrandom
110 # if defined(__x86_64__)
111 #  define __NR_getrandom 318
112 # elif defined(__i386__)
113 #  define __NR_getrandom 355
114 # elif defined(__aarch64__)
115 #  define __NR_getrandom 384
116 # elif defined(__arm__)
117 #  define __NR_getrandom 384
118 # elif defined(__ppc__)
119 #  define __NR_getrandom 359
120 # elif defined(__s390__)
121 #  define __NR_getrandom 349
122 # elif defined(__riscv)
123 #  define __NR_getrandom 278
124 # endif
125 #endif /* __NR_getrandom */
126 
127 enum {
128   UV__IORING_SETUP_SQPOLL = 2u,
129 };
130 
131 enum {
132   UV__IORING_FEAT_SINGLE_MMAP = 1u,
133   UV__IORING_FEAT_NODROP = 2u,
134   UV__IORING_FEAT_RSRC_TAGS = 1024u,  /* linux v5.13 */
135 };
136 
137 enum {
138   UV__IORING_OP_READV = 1,
139   UV__IORING_OP_WRITEV = 2,
140   UV__IORING_OP_FSYNC = 3,
141   UV__IORING_OP_OPENAT = 18,
142   UV__IORING_OP_CLOSE = 19,
143   UV__IORING_OP_STATX = 21,
144   UV__IORING_OP_EPOLL_CTL = 29,
145   UV__IORING_OP_RENAMEAT = 35,
146   UV__IORING_OP_UNLINKAT = 36,
147   UV__IORING_OP_MKDIRAT = 37,
148   UV__IORING_OP_SYMLINKAT = 38,
149   UV__IORING_OP_LINKAT = 39,
150 };
151 
152 enum {
153   UV__IORING_ENTER_GETEVENTS = 1u,
154   UV__IORING_ENTER_SQ_WAKEUP = 2u,
155 };
156 
157 enum {
158   UV__IORING_SQ_NEED_WAKEUP = 1u,
159   UV__IORING_SQ_CQ_OVERFLOW = 2u,
160 };
161 
162 enum {
163   UV__MKDIRAT_SYMLINKAT_LINKAT = 1u,
164 };
165 
166 struct uv__io_cqring_offsets {
167   uint32_t head;
168   uint32_t tail;
169   uint32_t ring_mask;
170   uint32_t ring_entries;
171   uint32_t overflow;
172   uint32_t cqes;
173   uint64_t reserved0;
174   uint64_t reserved1;
175 };
176 
177 STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
178 
179 struct uv__io_sqring_offsets {
180   uint32_t head;
181   uint32_t tail;
182   uint32_t ring_mask;
183   uint32_t ring_entries;
184   uint32_t flags;
185   uint32_t dropped;
186   uint32_t array;
187   uint32_t reserved0;
188   uint64_t reserved1;
189 };
190 
191 STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
192 
193 struct uv__io_uring_cqe {
194   uint64_t user_data;
195   int32_t res;
196   uint32_t flags;
197 };
198 
199 STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
200 
201 struct uv__io_uring_sqe {
202   uint8_t opcode;
203   uint8_t flags;
204   uint16_t ioprio;
205   int32_t fd;
206   union {
207     uint64_t off;
208     uint64_t addr2;
209   };
210   union {
211     uint64_t addr;
212   };
213   uint32_t len;
214   union {
215     uint32_t rw_flags;
216     uint32_t fsync_flags;
217     uint32_t open_flags;
218     uint32_t statx_flags;
219   };
220   uint64_t user_data;
221   union {
222     uint16_t buf_index;
223     uint64_t pad[3];
224   };
225 };
226 
227 STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
228 STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
229 STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
230 STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
231 STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
232 STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
233 STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
234 STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
235 STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
236 STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
237 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
238 
239 struct uv__io_uring_params {
240   uint32_t sq_entries;
241   uint32_t cq_entries;
242   uint32_t flags;
243   uint32_t sq_thread_cpu;
244   uint32_t sq_thread_idle;
245   uint32_t features;
246   uint32_t reserved[4];
247   struct uv__io_sqring_offsets sq_off;  /* 40 bytes */
248   struct uv__io_cqring_offsets cq_off;  /* 40 bytes */
249 };
250 
251 STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
252 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
253 STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
254 
255 STATIC_ASSERT(EPOLL_CTL_ADD < 4);
256 STATIC_ASSERT(EPOLL_CTL_DEL < 4);
257 STATIC_ASSERT(EPOLL_CTL_MOD < 4);
258 
259 struct watcher_list {
260   RB_ENTRY(watcher_list) entry;
261   struct uv__queue watchers;
262   int iterating;
263   char* path;
264   int wd;
265 };
266 
267 struct watcher_root {
268   struct watcher_list* rbh_root;
269 };
270 
271 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
272 static void uv__inotify_read(uv_loop_t* loop,
273                              uv__io_t* w,
274                              unsigned int revents);
275 static int compare_watchers(const struct watcher_list* a,
276                             const struct watcher_list* b);
277 static void maybe_free_watcher_list(struct watcher_list* w,
278                                     uv_loop_t* loop);
279 
280 static void uv__epoll_ctl_flush(int epollfd,
281                                 struct uv__iou* ctl,
282                                 struct epoll_event (*events)[256]);
283 
284 static void uv__epoll_ctl_prep(int epollfd,
285                                struct uv__iou* ctl,
286                                struct epoll_event (*events)[256],
287                                int op,
288                                int fd,
289                                struct epoll_event* e);
290 
291 RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
292 
293 
294 static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
295   /* This cast works because watcher_root is a struct with a pointer as its
296    * sole member. Such type punning is unsafe in the presence of strict
297    * pointer aliasing (and is just plain nasty) but that is why libuv
298    * is compiled with -fno-strict-aliasing.
299    */
300   return (struct watcher_root*) &loop->inotify_watchers;
301 }
302 
303 
304 unsigned uv__kernel_version(void) {
305   static _Atomic unsigned cached_version;
306   struct utsname u;
307   unsigned version;
308   unsigned major;
309   unsigned minor;
310   unsigned patch;
311   char v_sig[256];
312   char* needle;
313 
314   version = atomic_load_explicit(&cached_version, memory_order_relaxed);
315   if (version != 0)
316     return version;
317 
318   /* Check /proc/version_signature first as it's the way to get the mainline
319    * kernel version in Ubuntu. The format is:
320    *   Ubuntu ubuntu_kernel_version mainline_kernel_version
321    * For example:
322    *   Ubuntu 5.15.0-79.86-generic 5.15.111
323    */
324   if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
325     if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
326       goto calculate_version;
327 
328   if (-1 == uname(&u))
329     return 0;
330 
331   /* In Debian we need to check `version` instead of `release` to extract the
332  * mainline kernel version. This is an example of what it looks like:
333    *  #1 SMP Debian 5.10.46-4 (2021-08-03)
334    */
335   needle = strstr(u.version, "Debian ");
336   if (needle != NULL)
337     if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
338       goto calculate_version;
339 
340   if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
341     return 0;
342 
343   /* Handle it when the process runs under the UNAME26 personality:
344    *
345    * - kernels >= 3.x identify as 2.6.40+x
346    * - kernels >= 4.x identify as 2.6.60+x
347    *
348    * UNAME26 is a poorly conceived hack that doesn't let us distinguish
349    * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
350    * that 2.6.60+x means 4.x.
351    *
352    * Fun fact of the day: it's technically possible to observe the actual
353    * kernel version for a brief moment because uname() first copies out the
354    * real release string before overwriting it with the backcompat string.
355    */
356   if (major == 2 && minor == 6) {
357     if (patch >= 60) {
358       major = 4;
359       minor = patch - 60;
360       patch = 0;
361     } else if (patch >= 40) {
362       major = 3;
363       minor = patch - 40;
364       patch = 0;
365     }
366   }
367 
368 calculate_version:
369   version = major * 65536 + minor * 256 + patch;
370   atomic_store_explicit(&cached_version, version, memory_order_relaxed);
371 
372   return version;
373 }
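
/* Illustrative sketch, not part of libuv: the packed encoding above turns
 * kernel version checks into plain integer comparisons. A hypothetical helper
 * using the same major * 65536 + minor * 256 + patch encoding:
 */
static unsigned example_encode_kernel_version(unsigned major,
                                              unsigned minor,
                                              unsigned patch) {
  /* e.g. 5.15.111 encodes as 0x050F6F, 5.13.0 as 0x050D00 */
  return major * 65536 + minor * 256 + patch;
}
/* Example use (hypothetical): a "kernel is at least 5.13" check becomes
 *   uv__kernel_version() >= example_encode_kernel_version(5, 13, 0)
 */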
374 
375 
376 ssize_t
377 uv__fs_copy_file_range(int fd_in,
378                        off_t* off_in,
379                        int fd_out,
380                        off_t* off_out,
381                        size_t len,
382                        unsigned int flags)
383 {
384 #ifdef __NR_copy_file_range
385   return syscall(__NR_copy_file_range,
386                  fd_in,
387                  off_in,
388                  fd_out,
389                  off_out,
390                  len,
391                  flags);
392 #else
393   return errno = ENOSYS, -1;
394 #endif
395 }
396 
397 
398 int uv__statx(int dirfd,
399               const char* path,
400               int flags,
401               unsigned int mask,
402               struct uv__statx* statxbuf) {
403 #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
404   return errno = ENOSYS, -1;
405 #else
406   int rc;
407 
408   rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
409   if (rc >= 0)
410     uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
411 
412   return rc;
413 #endif
414 }
415 
416 
417 ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
418 #if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
419   return errno = ENOSYS, -1;
420 #else
421   ssize_t rc;
422 
423   rc = syscall(__NR_getrandom, buf, buflen, flags);
424   if (rc >= 0)
425     uv__msan_unpoison(buf, buflen);
426 
427   return rc;
428 #endif
429 }
430 
431 
432 int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
433   return syscall(__NR_io_uring_setup, entries, params);
434 }
435 
436 
437 int uv__io_uring_enter(int fd,
438                        unsigned to_submit,
439                        unsigned min_complete,
440                        unsigned flags) {
441   /* io_uring_enter used to take a sigset_t but it's unused
442    * in newer kernels unless IORING_ENTER_EXT_ARG is set,
443    * in which case it takes a struct io_uring_getevents_arg.
444    */
445   return syscall(__NR_io_uring_enter,
446                  fd,
447                  to_submit,
448                  min_complete,
449                  flags,
450                  NULL,
451                  0L);
452 }
453 
454 
455 int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
456   return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
457 }
458 
459 
460 static int uv__use_io_uring(void) {
461 #if defined(__ANDROID_API__)
462   return 0;  /* Possibly available but blocked by seccomp. */
463 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
464   /* See https://github.com/libuv/libuv/issues/4158. */
465   return 0;  /* All 32-bit kernels appear buggy. */
466 #elif defined(__powerpc64__) || defined(__ppc64__)
467   /* See https://github.com/libuv/libuv/issues/4283. */
468   return 0; /* Random SIGSEGV in signal handler. */
469 #else
470   /* Ternary: unknown=0, yes=1, no=-1 */
471   static _Atomic int use_io_uring;
472   char* val;
473   int use;
474 
475   use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
476 
477   if (use == 0) {
478     use = uv__kernel_version() >=
479 #if defined(__hppa__)
480     /* io_uring first supported on parisc in 6.1, functional in .51 */
481     /* https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/ */
482     /* 6.1.51 */ 0x060133
483 #else
484     /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
485     /* 5.10.186 */ 0x050ABA
486 #endif
487     ? 1 : -1;
488 
489     /* But users can still enable it if they so desire. */
490     val = getenv("UV_USE_IO_URING");
491     if (val != NULL)
492       use = atoi(val) ? 1 : -1;
493 
494     atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
495   }
496 
497   return use > 0;
498 #endif
499 }
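
/* Illustrative sketch, not part of libuv: applications can pin the decision
 * above through the UV_USE_IO_URING environment variable, provided it is set
 * before the cached value is computed (i.e. before the first event loop is
 * initialized). setenv(3) is POSIX; the helper name is hypothetical.
 */
static void example_force_io_uring_choice(int enable) {
  /* "0" maps to use == -1 (disabled); any non-zero value maps to use == 1. */
  setenv("UV_USE_IO_URING", enable ? "1" : "0", 1);
}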
500 
501 
502 static void uv__iou_init(int epollfd,
503                          struct uv__iou* iou,
504                          uint32_t entries,
505                          uint32_t flags) {
506   struct uv__io_uring_params params;
507   struct epoll_event e;
508   size_t cqlen;
509   size_t sqlen;
510   size_t maxlen;
511   size_t sqelen;
512   uint32_t i;
513   char* sq;
514   char* sqe;
515   int ringfd;
516 
517   sq = MAP_FAILED;
518   sqe = MAP_FAILED;
519 
520   if (!uv__use_io_uring())
521     return;
522 
523   /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
524    * Mostly academic because we check for a v5.13 kernel afterwards anyway.
525    */
526   memset(&params, 0, sizeof(params));
527   params.flags = flags;
528 
529   if (flags & UV__IORING_SETUP_SQPOLL)
530     params.sq_thread_idle = 10;  /* milliseconds */
531 
532   /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
533   ringfd = uv__io_uring_setup(entries, &params);
534   if (ringfd == -1)
535     return;
536 
537   /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
538    * actually detecting is whether IORING_OP_STATX works with SQPOLL.
539    */
540   if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
541     goto fail;
542 
543   /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
544   if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
545     goto fail;
546 
547   /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
548   if (!(params.features & UV__IORING_FEAT_NODROP))
549     goto fail;
550 
551   sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
552   cqlen =
553       params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
554   maxlen = sqlen < cqlen ? cqlen : sqlen;
555   sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
556 
557   sq = mmap(0,
558             maxlen,
559             PROT_READ | PROT_WRITE,
560             MAP_SHARED | MAP_POPULATE,
561             ringfd,
562             0);  /* IORING_OFF_SQ_RING */
563 
564   sqe = mmap(0,
565              sqelen,
566              PROT_READ | PROT_WRITE,
567              MAP_SHARED | MAP_POPULATE,
568              ringfd,
569              0x10000000ull);  /* IORING_OFF_SQES */
570 
571   if (sq == MAP_FAILED || sqe == MAP_FAILED)
572     goto fail;
573 
574   if (flags & UV__IORING_SETUP_SQPOLL) {
575     /* Only interested in completion events. To get notified when
576      * the kernel pulls items from the submission ring, add POLLOUT.
577      */
578     memset(&e, 0, sizeof(e));
579     e.events = POLLIN;
580     e.data.fd = ringfd;
581 
582     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
583       goto fail;
584   }
585 
586   iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
587   iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
588   iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
589   iou->sqarray = (uint32_t*) (sq + params.sq_off.array);
590   iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
591   iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
592   iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
593   iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
594   iou->sq = sq;
595   iou->cqe = sq + params.cq_off.cqes;
596   iou->sqe = sqe;
597   iou->sqlen = sqlen;
598   iou->cqlen = cqlen;
599   iou->maxlen = maxlen;
600   iou->sqelen = sqelen;
601   iou->ringfd = ringfd;
602   iou->in_flight = 0;
603   iou->flags = 0;
604 
605   if (uv__kernel_version() >= /* 5.15.0 */ 0x050F00)
606     iou->flags |= UV__MKDIRAT_SYMLINKAT_LINKAT;
607 
608   for (i = 0; i <= iou->sqmask; i++)
609     iou->sqarray[i] = i;  /* Slot -> sqe identity mapping. */
610 
611   return;
612 
613 fail:
614   if (sq != MAP_FAILED)
615     munmap(sq, maxlen);
616 
617   if (sqe != MAP_FAILED)
618     munmap(sqe, sqelen);
619 
620   uv__close(ringfd);
621 }
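
/* Sizing sketch for uv__iou_init() (illustrative): for entries == 64 the
 * kernel typically reports sq_entries == 64 and cq_entries == 128, so with
 * the struct sizes asserted earlier in this file:
 *
 *   sqlen  = sq_off.array + 64 * sizeof(uint32_t)   ring header + index array
 *   cqlen  = cq_off.cqes  + 128 * 16                ring header + CQEs
 *   maxlen = max(sqlen, cqlen)                      one mapping covers both
 *                                                   rings (SINGLE_MMAP)
 *   sqelen = 64 * 64 = 4096                         separate mapping at
 *                                                   IORING_OFF_SQES
 *
 * The sq_off/cq_off offsets themselves are chosen by the kernel and returned
 * in struct uv__io_uring_params.
 */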
622 
623 
624 static void uv__iou_delete(struct uv__iou* iou) {
625   if (iou->ringfd > -1) {
626     munmap(iou->sq, iou->maxlen);
627     munmap(iou->sqe, iou->sqelen);
628     uv__close(iou->ringfd);
629     iou->ringfd = -1;
630   }
631 }
632 
633 
634 int uv__platform_loop_init(uv_loop_t* loop) {
635   uv__loop_internal_fields_t* lfields;
636 
637   lfields = uv__get_internal_fields(loop);
638   lfields->ctl.ringfd = -1;
639   lfields->iou.ringfd = -2;  /* "uninitialized" */
640 
641   loop->inotify_watchers = NULL;
642   loop->inotify_fd = -1;
643   loop->backend_fd = epoll_create1(O_CLOEXEC);
644 
645   if (loop->backend_fd == -1)
646     return UV__ERR(errno);
647 
648   uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
649 
650   return 0;
651 }
652 
653 
654 int uv__io_fork(uv_loop_t* loop) {
655   int err;
656   struct watcher_list* root;
657 
658   root = uv__inotify_watchers(loop)->rbh_root;
659 
660   uv__close(loop->backend_fd);
661   loop->backend_fd = -1;
662 
663   /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
664   uv__platform_loop_delete(loop);
665 
666   err = uv__platform_loop_init(loop);
667   if (err)
668     return err;
669 
670   return uv__inotify_fork(loop, root);
671 }
672 
673 
674 void uv__platform_loop_delete(uv_loop_t* loop) {
675   uv__loop_internal_fields_t* lfields;
676 
677   lfields = uv__get_internal_fields(loop);
678   uv__iou_delete(&lfields->ctl);
679   uv__iou_delete(&lfields->iou);
680 
681   if (loop->inotify_fd != -1) {
682     uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
683     uv__close(loop->inotify_fd);
684     loop->inotify_fd = -1;
685   }
686 }
687 
688 
689 struct uv__invalidate {
690   struct epoll_event (*prep)[256];
691   struct epoll_event* events;
692   int nfds;
693 };
694 
695 
696 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
697   uv__loop_internal_fields_t* lfields;
698   struct uv__invalidate* inv;
699   struct epoll_event dummy;
700   int i;
701 
702   lfields = uv__get_internal_fields(loop);
703   inv = lfields->inv;
704 
705   /* Invalidate events with the same file descriptor */
706   if (inv != NULL)
707     for (i = 0; i < inv->nfds; i++)
708       if (inv->events[i].data.fd == fd)
709         inv->events[i].data.fd = -1;
710 
711   /* Remove the file descriptor from the epoll.
712    * This avoids a problem where the same file description remains open
713    * in another process, causing repeated junk epoll events.
714    *
715    * Perform EPOLL_CTL_DEL immediately instead of going through
716    * io_uring's submit queue, otherwise the file descriptor may
717    * be closed by the time the kernel starts the operation.
718    *
719    * We pass in a dummy epoll_event, to work around a bug in old kernels.
720    *
721    * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
722    * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
723    */
724   memset(&dummy, 0, sizeof(dummy));
725   epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
726 }
727 
728 
729 int uv__io_check_fd(uv_loop_t* loop, int fd) {
730   struct epoll_event e;
731   int rc;
732 
733   memset(&e, 0, sizeof(e));
734   e.events = POLLIN;
735   e.data.fd = -1;
736 
737   rc = 0;
738   if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
739     if (errno != EEXIST)
740       rc = UV__ERR(errno);
741 
742   if (rc == 0)
743     if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
744       abort();
745 
746   return rc;
747 }
748 
749 
750 /* Caller must initialize SQE and call uv__iou_submit(). */
751 static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
752                                                 uv_loop_t* loop,
753                                                 uv_fs_t* req) {
754   struct uv__io_uring_sqe* sqe;
755   uint32_t head;
756   uint32_t tail;
757   uint32_t mask;
758   uint32_t slot;
759 
760   /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
761    * initialization failed. Anything else is a valid ring file descriptor.
762    */
763   if (iou->ringfd == -2) {
764     /* By default, the SQPOLL is not created. Enable only if the loop is
765      * configured with UV_LOOP_USE_IO_URING_SQPOLL.
766      */
767     if ((loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL) == 0) {
768       iou->ringfd = -1;
769       return NULL;
770     }
771 
772     uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);
773     if (iou->ringfd == -2)
774       iou->ringfd = -1;  /* "failed" */
775   }
776 
777   if (iou->ringfd == -1)
778     return NULL;
779 
780   head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
781                               memory_order_acquire);
782   tail = *iou->sqtail;
783   mask = iou->sqmask;
784 
785   if ((head & mask) == ((tail + 1) & mask))
786     return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
787 
788   slot = tail & mask;
789   sqe = iou->sqe;
790   sqe = &sqe[slot];
791   memset(sqe, 0, sizeof(*sqe));
792   sqe->user_data = (uintptr_t) req;
793 
794   /* Pacify uv_cancel(). */
795   req->work_req.loop = loop;
796   req->work_req.work = NULL;
797   req->work_req.done = NULL;
798   uv__queue_init(&req->work_req.wq);
799 
800   uv__req_register(loop);
801   iou->in_flight++;
802 
803   return sqe;
804 }
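
/* Worked example of the ring-full check in uv__iou_get_sqe() (illustrative
 * numbers): with sq_entries == 64 the mask is 63. For head == 10 and
 * tail == 73, (tail + 1) & mask == 74 & 63 == 10 == (head & mask), so the
 * submission ring is considered full and the function returns NULL, pushing
 * the request to the thread pool instead.
 */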
805 
806 
807 static void uv__iou_submit(struct uv__iou* iou) {
808   uint32_t flags;
809 
810   atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
811                         *iou->sqtail + 1,
812                         memory_order_release);
813 
814   flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
815                                memory_order_acquire);
816 
817   if (flags & UV__IORING_SQ_NEED_WAKEUP)
818     if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
819       if (errno != EOWNERDEAD)  /* Kernel bug. Harmless, ignore. */
820         perror("libuv: io_uring_enter(wakeup)");  /* Can't happen. */
821 }
822 
823 
824 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
825   struct uv__io_uring_sqe* sqe;
826   struct uv__iou* iou;
827   int kv;
828 
829   kv = uv__kernel_version();
830   /* Work around a poorly understood bug in older kernels where closing a file
831    * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
832    * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
833    * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
834    * but good candidates are the several data race fixes. Interestingly, it
835    * seems to manifest only when running under Docker so the possibility of
836    * a Docker bug can't be completely ruled out either. Yay, computers.
837    * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
838    * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
839    * solved.
840    */
841   if (kv < /* 5.15.90 */ 0x050F5A)
842     return 0;
843 
844   if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
845     return 0;
846 
847 
848   iou = &uv__get_internal_fields(loop)->iou;
849 
850   sqe = uv__iou_get_sqe(iou, loop, req);
851   if (sqe == NULL)
852     return 0;
853 
854   sqe->fd = req->file;
855   sqe->opcode = UV__IORING_OP_CLOSE;
856 
857   uv__iou_submit(iou);
858 
859   return 1;
860 }
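
/* The version gates above use the same encoding as uv__kernel_version():
 *   5.15.90 -> 5 * 65536 + 15 * 256 + 90 = 0x050F5A
 *   5.16.0  -> 5 * 65536 + 16 * 256 +  0 = 0x051000
 *   6.1.0   -> 6 * 65536 +  1 * 256 +  0 = 0x060100
 */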
861 
862 
863 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
864                                   uv_fs_t* req,
865                                   uint32_t fsync_flags) {
866   struct uv__io_uring_sqe* sqe;
867   struct uv__iou* iou;
868 
869   iou = &uv__get_internal_fields(loop)->iou;
870 
871   sqe = uv__iou_get_sqe(iou, loop, req);
872   if (sqe == NULL)
873     return 0;
874 
875   /* Little known fact: setting sqe->off and sqe->len turns
876    * it into an asynchronous sync_file_range() operation.
877    */
878   sqe->fd = req->file;
879   sqe->fsync_flags = fsync_flags;
880   sqe->opcode = UV__IORING_OP_FSYNC;
881 
882   uv__iou_submit(iou);
883 
884   return 1;
885 }
886 
887 
888 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
889   struct uv__io_uring_sqe* sqe;
890   struct uv__iou* iou;
891 
892   iou = &uv__get_internal_fields(loop)->iou;
893 
894   if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
895     return 0;
896 
897   sqe = uv__iou_get_sqe(iou, loop, req);
898   if (sqe == NULL)
899     return 0;
900 
901   sqe->addr = (uintptr_t) req->path;
902   sqe->fd = AT_FDCWD;
903   sqe->addr2 = (uintptr_t) req->new_path;
904   sqe->len = AT_FDCWD;
905   sqe->opcode = UV__IORING_OP_LINKAT;
906 
907   uv__iou_submit(iou);
908 
909   return 1;
910 }
911 
912 
913 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
914   struct uv__io_uring_sqe* sqe;
915   struct uv__iou* iou;
916 
917   iou = &uv__get_internal_fields(loop)->iou;
918 
919   if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
920     return 0;
921 
922   sqe = uv__iou_get_sqe(iou, loop, req);
923   if (sqe == NULL)
924     return 0;
925 
926   sqe->addr = (uintptr_t) req->path;
927   sqe->fd = AT_FDCWD;
928   sqe->len = req->mode;
929   sqe->opcode = UV__IORING_OP_MKDIRAT;
930 
931   uv__iou_submit(iou);
932 
933   return 1;
934 }
935 
936 
937 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
938   struct uv__io_uring_sqe* sqe;
939   struct uv__iou* iou;
940 
941   iou = &uv__get_internal_fields(loop)->iou;
942 
943   sqe = uv__iou_get_sqe(iou, loop, req);
944   if (sqe == NULL)
945     return 0;
946 
947   sqe->addr = (uintptr_t) req->path;
948   sqe->fd = AT_FDCWD;
949   sqe->len = req->mode;
950   sqe->opcode = UV__IORING_OP_OPENAT;
951   sqe->open_flags = req->flags | O_CLOEXEC;
952 
953   uv__iou_submit(iou);
954 
955   return 1;
956 }
957 
958 
959 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
960   struct uv__io_uring_sqe* sqe;
961   struct uv__iou* iou;
962 
963   iou = &uv__get_internal_fields(loop)->iou;
964 
965   sqe = uv__iou_get_sqe(iou, loop, req);
966   if (sqe == NULL)
967     return 0;
968 
969   sqe->addr = (uintptr_t) req->path;
970   sqe->fd = AT_FDCWD;
971   sqe->addr2 = (uintptr_t) req->new_path;
972   sqe->len = AT_FDCWD;
973   sqe->opcode = UV__IORING_OP_RENAMEAT;
974 
975   uv__iou_submit(iou);
976 
977   return 1;
978 }
979 
980 
981 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
982   struct uv__io_uring_sqe* sqe;
983   struct uv__iou* iou;
984 
985   iou = &uv__get_internal_fields(loop)->iou;
986 
987   if (!(iou->flags & UV__MKDIRAT_SYMLINKAT_LINKAT))
988     return 0;
989 
990   sqe = uv__iou_get_sqe(iou, loop, req);
991   if (sqe == NULL)
992     return 0;
993 
994   sqe->addr = (uintptr_t) req->path;
995   sqe->fd = AT_FDCWD;
996   sqe->addr2 = (uintptr_t) req->new_path;
997   sqe->opcode = UV__IORING_OP_SYMLINKAT;
998 
999   uv__iou_submit(iou);
1000 
1001   return 1;
1002 }
1003 
1004 
1005 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
1006   struct uv__io_uring_sqe* sqe;
1007   struct uv__iou* iou;
1008 
1009   iou = &uv__get_internal_fields(loop)->iou;
1010 
1011   sqe = uv__iou_get_sqe(iou, loop, req);
1012   if (sqe == NULL)
1013     return 0;
1014 
1015   sqe->addr = (uintptr_t) req->path;
1016   sqe->fd = AT_FDCWD;
1017   sqe->opcode = UV__IORING_OP_UNLINKAT;
1018 
1019   uv__iou_submit(iou);
1020 
1021   return 1;
1022 }
1023 
1024 
1025 int uv__iou_fs_read_or_write(uv_loop_t* loop,
1026                              uv_fs_t* req,
1027                              int is_read) {
1028   struct uv__io_uring_sqe* sqe;
1029   struct uv__iou* iou;
1030 
1031   /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall
1032    * back to the thread pool on writes. */
1033   if (req->nbufs > IOV_MAX) {
1034     if (is_read)
1035       req->nbufs = IOV_MAX;
1036     else
1037       return 0;
1038   }
1039 
1040   iou = &uv__get_internal_fields(loop)->iou;
1041 
1042   sqe = uv__iou_get_sqe(iou, loop, req);
1043   if (sqe == NULL)
1044     return 0;
1045 
1046   sqe->addr = (uintptr_t) req->bufs;
1047   sqe->fd = req->file;
1048   sqe->len = req->nbufs;
1049   sqe->off = req->off < 0 ? -1 : req->off;
1050   sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1051 
1052   uv__iou_submit(iou);
1053 
1054   return 1;
1055 }
1056 
1057 
1058 int uv__iou_fs_statx(uv_loop_t* loop,
1059                      uv_fs_t* req,
1060                      int is_fstat,
1061                      int is_lstat) {
1062   struct uv__io_uring_sqe* sqe;
1063   struct uv__statx* statxbuf;
1064   struct uv__iou* iou;
1065 
1066   statxbuf = uv__malloc(sizeof(*statxbuf));
1067   if (statxbuf == NULL)
1068     return 0;
1069 
1070   iou = &uv__get_internal_fields(loop)->iou;
1071 
1072   sqe = uv__iou_get_sqe(iou, loop, req);
1073   if (sqe == NULL) {
1074     uv__free(statxbuf);
1075     return 0;
1076   }
1077 
1078   req->ptr = statxbuf;
1079 
1080   sqe->addr = (uintptr_t) req->path;
1081   sqe->addr2 = (uintptr_t) statxbuf;
1082   sqe->fd = AT_FDCWD;
1083   sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1084   sqe->opcode = UV__IORING_OP_STATX;
1085 
1086   if (is_fstat) {
1087     sqe->addr = (uintptr_t) "";
1088     sqe->fd = req->file;
1089     sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1090   }
1091 
1092   if (is_lstat)
1093     sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1094 
1095   uv__iou_submit(iou);
1096 
1097   return 1;
1098 }
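
/* Mask arithmetic used above (for reference): STATX_BASIC_STATS is 0x7FF and
 * STATX_BTIME is 0x800, so 0x7FF | 0x800 == 0xFFF, i.e. "basic stats plus
 * birth time". AT_EMPTY_PATH (0x1000) makes the kernel operate on sqe->fd
 * itself when the path is "".
 */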
1099 
1100 
1101 void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
1102   buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
1103   buf->st_mode = statxbuf->stx_mode;
1104   buf->st_nlink = statxbuf->stx_nlink;
1105   buf->st_uid = statxbuf->stx_uid;
1106   buf->st_gid = statxbuf->stx_gid;
1107   buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
1108   buf->st_ino = statxbuf->stx_ino;
1109   buf->st_size = statxbuf->stx_size;
1110   buf->st_blksize = statxbuf->stx_blksize;
1111   buf->st_blocks = statxbuf->stx_blocks;
1112   buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
1113   buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
1114   buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
1115   buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
1116   buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
1117   buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
1118   buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
1119   buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
1120   buf->st_flags = 0;
1121   buf->st_gen = 0;
1122 }
1123 
1124 
1125 static void uv__iou_fs_statx_post(uv_fs_t* req) {
1126   struct uv__statx* statxbuf;
1127   uv_stat_t* buf;
1128 
1129   buf = &req->statbuf;
1130   statxbuf = req->ptr;
1131   req->ptr = NULL;
1132 
1133   if (req->result == 0) {
1134     uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
1135     uv__statx_to_stat(statxbuf, buf);
1136     req->ptr = buf;
1137   }
1138 
1139   uv__free(statxbuf);
1140 }
1141 
1142 
1143 static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
1144   struct uv__io_uring_cqe* cqe;
1145   struct uv__io_uring_cqe* e;
1146   uv_fs_t* req;
1147   uint32_t head;
1148   uint32_t tail;
1149   uint32_t mask;
1150   uint32_t i;
1151   uint32_t flags;
1152   int nevents;
1153   int rc;
1154 
1155   head = *iou->cqhead;
1156   tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
1157                               memory_order_acquire);
1158   mask = iou->cqmask;
1159   cqe = iou->cqe;
1160   nevents = 0;
1161 
1162   for (i = head; i != tail; i++) {
1163     e = &cqe[i & mask];
1164 
1165     req = (uv_fs_t*) (uintptr_t) e->user_data;
1166     assert(req->type == UV_FS);
1167 
1168     uv__req_unregister(loop);
1169     iou->in_flight--;
1170 
1171     /* If the op is not supported by the kernel retry using the thread pool */
1172     if (e->res == -EOPNOTSUPP) {
1173       uv__fs_post(loop, req);
1174       continue;
1175     }
1176 
1177     /* io_uring stores error codes as negative numbers, same as libuv. */
1178     req->result = e->res;
1179 
1180     switch (req->fs_type) {
1181       case UV_FS_FSTAT:
1182       case UV_FS_LSTAT:
1183       case UV_FS_STAT:
1184         uv__iou_fs_statx_post(req);
1185         break;
1186       default:  /* Squelch -Wswitch warnings. */
1187         break;
1188     }
1189 
1190     uv__metrics_update_idle_time(loop);
1191     req->cb(req);
1192     nevents++;
1193   }
1194 
1195   atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
1196                         tail,
1197                         memory_order_release);
1198 
1199   /* Check whether CQEs overflowed; if so, enter the kernel to make them
1200    * available. Don't grab them immediately but in the next loop iteration to
1201    * avoid loop starvation. */
1202   flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
1203                                memory_order_acquire);
1204 
1205   if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
1206     do
1207       rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
1208     while (rc == -1 && errno == EINTR);
1209 
1210     if (rc < 0)
1211       perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
1212   }
1213 
1214   uv__metrics_inc_events(loop, nevents);
1215   if (uv__get_internal_fields(loop)->current_timeout == 0)
1216     uv__metrics_inc_events_waiting(loop, nevents);
1217 }
1218 
1219 
1220 /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1221  * executed immediately, otherwise the file descriptor may have been closed
1222  * by the time the kernel starts the operation.
1223  */
1224 static void uv__epoll_ctl_prep(int epollfd,
1225                                struct uv__iou* ctl,
1226                                struct epoll_event (*events)[256],
1227                                int op,
1228                                int fd,
1229                                struct epoll_event* e) {
1230   struct uv__io_uring_sqe* sqe;
1231   struct epoll_event* pe;
1232   uint32_t mask;
1233   uint32_t slot;
1234 
1235   assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
1236   assert(ctl->ringfd != -1);
1237 
1238   mask = ctl->sqmask;
1239   slot = (*ctl->sqtail)++ & mask;
1240 
1241   pe = &(*events)[slot];
1242   *pe = *e;
1243 
1244   sqe = ctl->sqe;
1245   sqe = &sqe[slot];
1246 
1247   memset(sqe, 0, sizeof(*sqe));
1248   sqe->addr = (uintptr_t) pe;
1249   sqe->fd = epollfd;
1250   sqe->len = op;
1251   sqe->off = fd;
1252   sqe->opcode = UV__IORING_OP_EPOLL_CTL;
1253   sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
1254 
1255   if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
1256     uv__epoll_ctl_flush(epollfd, ctl, events);
1257 }
1258 
1259 
1260 static void uv__epoll_ctl_flush(int epollfd,
1261                                 struct uv__iou* ctl,
1262                                 struct epoll_event (*events)[256]) {
1263   struct epoll_event oldevents[256];
1264   struct uv__io_uring_cqe* cqe;
1265   uint32_t oldslot;
1266   uint32_t slot;
1267   uint32_t n;
1268   int fd;
1269   int op;
1270   int rc;
1271 
1272   STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
1273   assert(ctl->ringfd != -1);
1274   assert(*ctl->sqhead != *ctl->sqtail);
1275 
1276   n = *ctl->sqtail - *ctl->sqhead;
1277   do
1278     rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
1279   while (rc == -1 && errno == EINTR);
1280 
1281   if (rc < 0)
1282     perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
1283 
1284   if (rc != (int) n)
1285     abort();
1286 
1287   assert(*ctl->sqhead == *ctl->sqtail);
1288 
1289   memcpy(oldevents, *events, sizeof(*events));
1290 
1291   /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
1292    * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
1293    * that we are already watching. Ignore the former and retry the latter
1294    * with EPOLL_CTL_MOD.
1295    */
1296   while (*ctl->cqhead != *ctl->cqtail) {
1297     slot = (*ctl->cqhead)++ & ctl->cqmask;
1298 
1299     cqe = ctl->cqe;
1300     cqe = &cqe[slot];
1301 
1302     if (cqe->res == 0)
1303       continue;
1304 
1305     fd = cqe->user_data >> 32;
1306     op = 3 & cqe->user_data;
1307     oldslot = 255 & (cqe->user_data >> 2);
1308 
1309     if (op == EPOLL_CTL_DEL)
1310       continue;
1311 
1312     if (op != EPOLL_CTL_ADD)
1313       abort();
1314 
1315     if (cqe->res != -EEXIST)
1316       abort();
1317 
1318     uv__epoll_ctl_prep(epollfd,
1319                        ctl,
1320                        events,
1321                        EPOLL_CTL_MOD,
1322                        fd,
1323                        &oldevents[oldslot]);
1324   }
1325 }
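
/* Worked example of the user_data packing shared by uv__epoll_ctl_prep() and
 * uv__epoll_ctl_flush() (illustrative values): op == EPOLL_CTL_ADD (1),
 * slot == 5, fd == 42 gives
 *
 *   user_data = 1 | 5 << 2 | (int64_t) 42 << 32 = 0x2A00000015
 *
 * which decodes as op = 3 & user_data = 1, slot = 255 & (user_data >> 2) = 5,
 * and fd = user_data >> 32 = 42.
 */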
1326 
1327 
1328 void uv__io_poll(uv_loop_t* loop, int timeout) {
1329   uv__loop_internal_fields_t* lfields;
1330   struct epoll_event events[1024];
1331   struct epoll_event prep[256];
1332   struct uv__invalidate inv;
1333   struct epoll_event* pe;
1334   struct epoll_event e;
1335   struct uv__iou* ctl;
1336   struct uv__iou* iou;
1337   int real_timeout;
1338   struct uv__queue* q;
1339   uv__io_t* w;
1340   sigset_t* sigmask;
1341   sigset_t sigset;
1342   uint64_t base;
1343   int have_iou_events;
1344   int have_signals;
1345   int nevents;
1346   int epollfd;
1347   int count;
1348   int nfds;
1349   int fd;
1350   int op;
1351   int i;
1352   int user_timeout;
1353   int reset_timeout;
1354 
1355   lfields = uv__get_internal_fields(loop);
1356   ctl = &lfields->ctl;
1357   iou = &lfields->iou;
1358 
1359   sigmask = NULL;
1360   if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
1361     sigemptyset(&sigset);
1362     sigaddset(&sigset, SIGPROF);
1363     sigmask = &sigset;
1364   }
1365 
1366   assert(timeout >= -1);
1367   base = loop->time;
1368   count = 48; /* Benchmarks suggest this gives the best throughput. */
1369   real_timeout = timeout;
1370 
1371   if (lfields->flags & UV_METRICS_IDLE_TIME) {
1372     reset_timeout = 1;
1373     user_timeout = timeout;
1374     timeout = 0;
1375   } else {
1376     reset_timeout = 0;
1377     user_timeout = 0;
1378   }
1379 
1380   epollfd = loop->backend_fd;
1381 
1382   memset(&e, 0, sizeof(e));
1383 
1384   while (!uv__queue_empty(&loop->watcher_queue)) {
1385     q = uv__queue_head(&loop->watcher_queue);
1386     w = uv__queue_data(q, uv__io_t, watcher_queue);
1387     uv__queue_remove(q);
1388     uv__queue_init(q);
1389 
1390     op = EPOLL_CTL_MOD;
1391     if (w->events == 0)
1392       op = EPOLL_CTL_ADD;
1393 
1394     w->events = w->pevents;
1395     e.events = w->pevents;
1396     if (w == &loop->async_io_watcher)
1397       /* Enable edge-triggered mode on async_io_watcher(eventfd),
1398        * so that we're able to eliminate the overhead of reading
1399        * the eventfd via system call on each event loop wakeup.
1400        */
1401       e.events |= EPOLLET;
1402     e.data.fd = w->fd;
1403     fd = w->fd;
1404 
1405     if (ctl->ringfd != -1) {
1406       uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
1407       continue;
1408     }
1409 
1410     if (!epoll_ctl(epollfd, op, fd, &e))
1411       continue;
1412 
1413     assert(op == EPOLL_CTL_ADD);
1414     assert(errno == EEXIST);
1415 
1416     /* File descriptor that's been watched before, update event mask. */
1417     if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
1418       abort();
1419   }
1420 
1421   inv.events = events;
1422   inv.prep = &prep;
1423   inv.nfds = -1;
1424 
1425   for (;;) {
1426     if (loop->nfds == 0)
1427       if (iou->in_flight == 0)
1428         break;
1429 
1430     /* All event mask mutations should be visible to the kernel before
1431      * we enter epoll_pwait().
1432      */
1433     if (ctl->ringfd != -1)
1434       while (*ctl->sqhead != *ctl->sqtail)
1435         uv__epoll_ctl_flush(epollfd, ctl, &prep);
1436 
1437     /* Only need to set the provider_entry_time if timeout != 0. The function
1438      * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
1439      */
1440     if (timeout != 0)
1441       uv__metrics_set_provider_entry_time(loop);
1442 
1443     /* Store the current timeout in a location that's globally accessible so
1444      * other locations like uv__work_done() can determine whether the queue
1445      * of events in the callback were waiting when poll was called.
1446      */
1447     lfields->current_timeout = timeout;
1448 
1449     nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
1450 
1451     /* Update loop->time unconditionally. It's tempting to skip the update when
1452      * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
1453      * operating system didn't reschedule our process while in the syscall.
1454      */
1455     SAVE_ERRNO(uv__update_time(loop));
1456 
1457     if (nfds == -1)
1458       assert(errno == EINTR);
1459     else if (nfds == 0)
1460       /* Unlimited timeout should only return with events or signal. */
1461       assert(timeout != -1);
1462 
1463     if (nfds == 0 || nfds == -1) {
1464       if (reset_timeout != 0) {
1465         timeout = user_timeout;
1466         reset_timeout = 0;
1467       } else if (nfds == 0) {
1468         return;
1469       }
1470 
1471       /* Interrupted by a signal. Update timeout and poll again. */
1472       goto update_timeout;
1473     }
1474 
1475     have_iou_events = 0;
1476     have_signals = 0;
1477     nevents = 0;
1478 
1479     inv.nfds = nfds;
1480     lfields->inv = &inv;
1481 
1482     for (i = 0; i < nfds; i++) {
1483       pe = events + i;
1484       fd = pe->data.fd;
1485 
1486       /* Skip invalidated events, see uv__platform_invalidate_fd */
1487       if (fd == -1)
1488         continue;
1489 
1490       if (fd == iou->ringfd) {
1491         uv__poll_io_uring(loop, iou);
1492         have_iou_events = 1;
1493         continue;
1494       }
1495 
1496       assert(fd >= 0);
1497       assert((unsigned) fd < loop->nwatchers);
1498 
1499       w = loop->watchers[fd];
1500 
1501       if (w == NULL) {
1502         /* File descriptor that we've stopped watching, disarm it.
1503          *
1504          * Ignore all errors because we may be racing with another thread
1505          * when the file descriptor is closed.
1506          *
1507          * Perform EPOLL_CTL_DEL immediately instead of going through
1508          * io_uring's submit queue, otherwise the file descriptor may
1509          * be closed by the time the kernel starts the operation.
1510          */
1511         epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
1512         continue;
1513       }
1514 
1515       /* Give users only events they're interested in. Prevents spurious
1516        * callbacks when previous callback invocation in this loop has stopped
1517        * the current watcher. Also filters out events that the user has not
1518        * requested us to watch.
1519        */
1520       pe->events &= w->pevents | POLLERR | POLLHUP;
1521 
1522       /* Work around an epoll quirk where it sometimes reports just the
1523        * EPOLLERR or EPOLLHUP event.  In order to force the event loop to
1524        * move forward, we merge in the read/write events that the watcher
1525        * is interested in; uv__read() and uv__write() will then deal with
1526        * the error or hangup in the usual fashion.
1527        *
1528        * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
1529        * reads the available data, calls uv_read_stop(), then sometime later
1530        * calls uv_read_start() again.  By then, libuv has forgotten about the
1531        * hangup and the kernel won't report EPOLLIN again because there's
1532        * nothing left to read.  If anything, libuv is to blame here.  The
1533        * current hack is just a quick bandaid; to properly fix it, libuv
1534        * needs to remember the error/hangup event.  We should get that for
1535        * free when we switch over to edge-triggered I/O.
1536        */
1537       if (pe->events == POLLERR || pe->events == POLLHUP)
1538         pe->events |=
1539           w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);
1540 
1541       if (pe->events != 0) {
1542         /* Run signal watchers last.  This also affects child process watchers
1543          * because those are implemented in terms of signal watchers.
1544          */
1545         if (w == &loop->signal_io_watcher) {
1546           have_signals = 1;
1547         } else {
1548           uv__metrics_update_idle_time(loop);
1549           w->cb(loop, w, pe->events);
1550         }
1551 
1552         nevents++;
1553       }
1554     }
1555 
1556     uv__metrics_inc_events(loop, nevents);
1557     if (reset_timeout != 0) {
1558       timeout = user_timeout;
1559       reset_timeout = 0;
1560       uv__metrics_inc_events_waiting(loop, nevents);
1561     }
1562 
1563     if (have_signals != 0) {
1564       uv__metrics_update_idle_time(loop);
1565       loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
1566     }
1567 
1568     lfields->inv = NULL;
1569 
1570     if (have_iou_events != 0)
1571       break;  /* Event loop should cycle now so don't poll again. */
1572 
1573     if (have_signals != 0)
1574       break;  /* Event loop should cycle now so don't poll again. */
1575 
1576     if (nevents != 0) {
1577       if (nfds == ARRAY_SIZE(events) && --count != 0) {
1578         /* Poll for more events but don't block this time. */
1579         timeout = 0;
1580         continue;
1581       }
1582       break;
1583     }
1584 
1585 update_timeout:
1586     if (timeout == 0)
1587       break;
1588 
1589     if (timeout == -1)
1590       continue;
1591 
1592     assert(timeout > 0);
1593 
1594     real_timeout -= (loop->time - base);
1595     if (real_timeout <= 0)
1596       break;
1597 
1598     timeout = real_timeout;
1599   }
1600 
1601   if (ctl->ringfd != -1)
1602     while (*ctl->sqhead != *ctl->sqtail)
1603       uv__epoll_ctl_flush(epollfd, ctl, &prep);
1604 }
1605 
1606 uint64_t uv__hrtime(uv_clocktype_t type) {
1607   static _Atomic clock_t fast_clock_id = -1;
1608   struct timespec t;
1609   clock_t clock_id;
1610 
1611   /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
1612    * millisecond granularity or better.  CLOCK_MONOTONIC_COARSE is
1613    * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
1614    * decide to make a costly system call.
1615    */
1616   /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
1617    * when it has microsecond granularity or better (unlikely).
1618    */
1619   clock_id = CLOCK_MONOTONIC;
1620   if (type != UV_CLOCK_FAST)
1621     goto done;
1622 
1623   clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
1624   if (clock_id != -1)
1625     goto done;
1626 
1627   clock_id = CLOCK_MONOTONIC;
1628   if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
1629     if (t.tv_nsec <= 1 * 1000 * 1000)
1630       clock_id = CLOCK_MONOTONIC_COARSE;
1631 
1632   atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
1633 
1634 done:
1635 
1636   if (clock_gettime(clock_id, &t))
1637     return 0;  /* Not really possible. */
1638 
1639   return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
1640 }
1641 
1642 
1643 int uv_resident_set_memory(size_t* rss) {
1644   char buf[1024];
1645   const char* s;
1646   long val;
1647   int rc;
1648   int i;
1649 
1650   /* rss: 24th element */
1651   rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1652   if (rc < 0)
1653     return rc;
1654 
1655   /* find the last ')' */
1656   s = strrchr(buf, ')');
1657   if (s == NULL)
1658     goto err;
1659 
1660   for (i = 1; i <= 22; i++) {
1661     s = strchr(s + 1, ' ');
1662     if (s == NULL)
1663       goto err;
1664   }
1665 
1666   errno = 0;
1667   val = strtol(s, NULL, 10);
1668   if (val < 0 || errno != 0)
1669     goto err;
1670 
1671   *rss = val * getpagesize();
1672   return 0;
1673 
1674 err:
1675   return UV_EINVAL;
1676 }
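
/* Parsing note for uv_resident_set_memory() (for reference): in
 * /proc/self/stat the comm field (field 2) is parenthesized and may contain
 * spaces, so the code first seeks the last ')' and then skips 22
 * space-separated fields, landing on field 24 (rss). rss is reported in
 * pages, hence the getpagesize() scaling.
 */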
1677 
1678 int uv_uptime(double* uptime) {
1679   struct timespec now;
1680   char buf[128];
1681 
1682   /* Consult /proc/uptime when present (common case), or fall back to
1683    * clock_gettime. Why not always clock_gettime? It doesn't always return the
1684    * right result under OpenVZ and possibly other containerized environments.
1685    */
1686   if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1687     if (1 == sscanf(buf, "%lf", uptime))
1688       return 0;
1689 
1690   if (clock_gettime(CLOCK_BOOTTIME, &now))
1691     return UV__ERR(errno);
1692 
1693   *uptime = now.tv_sec;
1694   return 0;
1695 }
1696 
1697 
1698 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
1699 #if defined(__PPC__)
1700   static const char model_marker[] = "cpu\t\t: ";
1701 #elif defined(__arm__)
1702   static const char model_marker[] = "Processor\t: ";
1703 #elif defined(__aarch64__)
1704   static const char model_marker[] = "CPU part\t: ";
1705 #elif defined(__mips__)
1706   static const char model_marker[] = "cpu model\t\t: ";
1707 #elif defined(__loongarch__)
1708   static const char model_marker[] = "cpu family\t\t: ";
1709 #else
1710   static const char model_marker[] = "model name\t: ";
1711 #endif
1712   static const char parts[] =
1713 #ifdef __aarch64__
1714     "0x811\nARM810\n"       "0x920\nARM920\n"      "0x922\nARM922\n"
1715     "0x926\nARM926\n"       "0x940\nARM940\n"      "0x946\nARM946\n"
1716     "0x966\nARM966\n"       "0xa20\nARM1020\n"      "0xa22\nARM1022\n"
1717     "0xa26\nARM1026\n"      "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
1718     "0xb56\nARM1156\n"      "0xb76\nARM1176\n"      "0xc05\nCortex-A5\n"
1719     "0xc07\nCortex-A7\n"    "0xc08\nCortex-A8\n"    "0xc09\nCortex-A9\n"
1720     "0xc0d\nCortex-A17\n"   /* Originally A12 */
1721     "0xc0f\nCortex-A15\n"   "0xc0e\nCortex-A17\n"   "0xc14\nCortex-R4\n"
1722     "0xc15\nCortex-R5\n"    "0xc17\nCortex-R7\n"    "0xc18\nCortex-R8\n"
1723     "0xc20\nCortex-M0\n"    "0xc21\nCortex-M1\n"    "0xc23\nCortex-M3\n"
1724     "0xc24\nCortex-M4\n"    "0xc27\nCortex-M7\n"    "0xc60\nCortex-M0+\n"
1725     "0xd01\nCortex-A32\n"   "0xd03\nCortex-A53\n"   "0xd04\nCortex-A35\n"
1726     "0xd05\nCortex-A55\n"   "0xd06\nCortex-A65\n"   "0xd07\nCortex-A57\n"
1727     "0xd08\nCortex-A72\n"   "0xd09\nCortex-A73\n"   "0xd0a\nCortex-A75\n"
1728     "0xd0b\nCortex-A76\n"   "0xd0c\nNeoverse-N1\n"  "0xd0d\nCortex-A77\n"
1729     "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n"   "0xd20\nCortex-M23\n"
1730     "0xd21\nCortex-M33\n"   "0xd41\nCortex-A78\n"   "0xd42\nCortex-A78AE\n"
1731     "0xd4a\nNeoverse-E1\n"  "0xd4b\nCortex-A78C\n"
1732 #endif
1733     "";
1734   struct cpu {
1735     unsigned long long freq, user, nice, sys, idle, irq;
1736     unsigned model;
1737   };
1738   FILE* fp;
1739   char* p;
1740   int found;
1741   int n;
1742   unsigned i;
1743   unsigned cpu;
1744   unsigned maxcpu;
1745   unsigned size;
1746   unsigned long long skip;
1747   struct cpu (*cpus)[8192];  /* Kernel maximum. */
1748   struct cpu* c;
1749   struct cpu t;
1750   char (*model)[64];
1751   unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
1752   /* Assumption: even big.LITTLE systems will have only a handful
1753    * of different CPU models. Most systems will just have one.
1754    */
1755   char models[8][64];
1756   char buf[1024];
1757 
1758   memset(bitmap, 0, sizeof(bitmap));
1759   memset(models, 0, sizeof(models));
1760   snprintf(*models, sizeof(*models), "unknown");
1761   maxcpu = 0;
1762 
1763   cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
1764   if (cpus == NULL)
1765     return UV_ENOMEM;
1766 
1767   fp = uv__open_file("/proc/stat");
1768   if (fp == NULL) {
1769     uv__free(cpus);
1770     return UV__ERR(errno);
1771   }
1772 
1773   if (NULL == fgets(buf, sizeof(buf), fp))
1774     abort();
1775 
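       /* Per-CPU lines in /proc/stat look like
        * "cpu0 user nice system idle iowait irq softirq ...", with the
        * counters expressed in clock ticks; the aggregate "cpu " line was
        * consumed by the fgets() above. (Field names shown for illustration.)
        */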
1776   for (;;) {
1777     memset(&t, 0, sizeof(t));
1778 
1779     n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
1780                &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
1781 
1782     if (n != 7)
1783       break;
1784 
1785     if (NULL == fgets(buf, sizeof(buf), fp))
1786       abort();
1787 
1788     if (cpu >= ARRAY_SIZE(*cpus))
1789       continue;
1790 
1791     (*cpus)[cpu] = t;
1792 
1793     bitmap[cpu >> 3] |= 1 << (cpu & 7);
1794 
1795     if (cpu >= maxcpu)
1796       maxcpu = cpu + 1;
1797   }
1798 
1799   fclose(fp);
1800 
1801   fp = uv__open_file("/proc/cpuinfo");
1802   if (fp == NULL)
1803     goto nocpuinfo;
1804 
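       /* Each /proc/cpuinfo record starts with a "processor\t: N" line,
        * followed by key/value pairs (e.g. "model name\t: ...") and ends with
        * a blank line; the loop below extracts the model string per processor.
        */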
1805   for (;;) {
1806     if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
1807       break;  /* Parse error. */
1808 
1809     found = 0;
1810     while (!found && fgets(buf, sizeof(buf), fp))
1811       found = !strncmp(buf, model_marker, sizeof(model_marker) - 1);
1812 
1813     if (!found)
1814       goto next;
1815 
1816     p = buf + sizeof(model_marker) - 1;
1817     n = (int) strcspn(p, "\n");
1818 
1819     /* arm64: translate CPU part code to model name. */
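         /* For example, a "CPU part\t: 0xd03" entry resolves to "Cortex-A53"
          * via the parts table above. (Illustrative.)
          */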
1820     if (*parts) {
1821       p = memmem(parts, sizeof(parts) - 1, p, n + 1);
1822       if (p == NULL)
1823         p = "unknown";
1824       else
1825         p += n + 1;
1826       n = (int) strcspn(p, "\n");
1827     }
1828 
1829     found = 0;
1830     for (model = models; !found && model < ARRAY_END(models); model++)
1831       found = !strncmp(p, *model, strlen(*model));
1832 
1833     if (!found)
1834       goto next;
1835 
1836     if (**model == '\0')
1837       snprintf(*model, sizeof(*model), "%.*s", n, p);
1838 
1839     if (cpu < maxcpu)
1840       (*cpus)[cpu].model = model - models;
1841 
1842 next:
1843     while (fgets(buf, sizeof(buf), fp))
1844       if (*buf == '\n')
1845         break;
1846   }
1847 
1848   fclose(fp);
1849   fp = NULL;
1850 
1851 nocpuinfo:
1852 
1853   n = 0;
1854   for (cpu = 0; cpu < maxcpu; cpu++) {
1855     if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1856       continue;
1857 
1858     n++;
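         /* scaling_cur_freq reports the current frequency in kHz; it is later
          * divided by 1000 to yield the MHz value exposed in uv_cpu_info_t.
          */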
1859     snprintf(buf, sizeof(buf),
1860              "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
1861 
1862     fp = uv__open_file(buf);
1863     if (fp == NULL)
1864       continue;
1865 
1866     if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
1867       abort();
1868     fclose(fp);
1869     fp = NULL;
1870   }
1871 
1872   size = n * sizeof(**ci) + sizeof(models);
1873   *ci = uv__malloc(size);
1874   *count = 0;
1875 
1876   if (*ci == NULL) {
1877     uv__free(cpus);
1878     return UV_ENOMEM;
1879   }
1880 
1881   *count = n;
1882   p = memcpy(*ci + n, models, sizeof(models));
1883 
1884   i = 0;
1885   for (cpu = 0; cpu < maxcpu; cpu++) {
1886     if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1887       continue;
1888 
1889     c = *cpus + cpu;
1890 
1891     (*ci)[i++] = (uv_cpu_info_t) {
1892       .model     = p + c->model * sizeof(*model),
1893       .speed     = c->freq / 1000,
1894       /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
1895        * therefore the multiplier is always 1000/100 = 10.
1896        */
1897       .cpu_times = (struct uv_cpu_times_s) {
1898         .user = 10 * c->user,
1899         .nice = 10 * c->nice,
1900         .sys  = 10 * c->sys,
1901         .idle = 10 * c->idle,
1902         .irq  = 10 * c->irq,
1903       },
1904     };
1905   }
1906 
1907   uv__free(cpus);
1908 
1909   return 0;
1910 }
1911 
1912 
1913 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
1914   if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
1915     return 1;
1916   if (ent->ifa_addr == NULL)
1917     return 1;
1918   /*
1919    * On Linux getifaddrs returns information related to the raw underlying
1920    * devices. We're not interested in this information yet.
1921    */
1922   if (ent->ifa_addr->sa_family == PF_PACKET)
1923     return exclude_type;
1924   return !exclude_type;
1925 }
1926 
1927 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
1928   struct ifaddrs *addrs, *ent;
1929   uv_interface_address_t* address;
1930   int i;
1931   struct sockaddr_ll *sll;
1932 
1933   *count = 0;
1934   *addresses = NULL;
1935 
1936   if (getifaddrs(&addrs))
1937     return UV__ERR(errno);
1938 
1939   /* Count the number of interfaces */
1940   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1941     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1942       continue;
1943 
1944     (*count)++;
1945   }
1946 
1947   if (*count == 0) {
1948     freeifaddrs(addrs);
1949     return 0;
1950   }
1951 
1952   /* Make sure the memory is initialized to zero using calloc(). */
1953   *addresses = uv__calloc(*count, sizeof(**addresses));
1954   if (!(*addresses)) {
1955     freeifaddrs(addrs);
1956     return UV_ENOMEM;
1957   }
1958 
1959   address = *addresses;
1960 
1961   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1962     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1963       continue;
1964 
1965     address->name = uv__strdup(ent->ifa_name);
1966 
1967     if (ent->ifa_addr->sa_family == AF_INET6) {
1968       address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
1969     } else {
1970       address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
1971     }
1972 
1973     if (ent->ifa_netmask->sa_family == AF_INET6) {
1974       address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
1975     } else {
1976       address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
1977     }
1978 
1979     address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
1980 
1981     address++;
1982   }
1983 
1984   /* Fill in physical addresses for each interface */
1985   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1986     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
1987       continue;
1988 
1989     address = *addresses;
1990 
1991     for (i = 0; i < (*count); i++) {
1992       size_t namelen = strlen(ent->ifa_name);
1993       /* Alias interfaces share the same physical address. */
1994       if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
1995           (address->name[namelen] == 0 || address->name[namelen] == ':')) {
1996         sll = (struct sockaddr_ll*)ent->ifa_addr;
1997         memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
1998       }
1999       address++;
2000     }
2001   }
2002 
2003   freeifaddrs(addrs);
2004 
2005   return 0;
2006 }
2007 
2008 
2009 void uv_free_interface_addresses(uv_interface_address_t* addresses,
2010   int count) {
2011   int i;
2012 
2013   for (i = 0; i < count; i++) {
2014     uv__free(addresses[i].name);
2015   }
2016 
2017   uv__free(addresses);
2018 }
2019 
2020 
2021 void uv__set_process_title(const char* title) {
2022 #if defined(PR_SET_NAME)
2023   prctl(PR_SET_NAME, title);  /* Only copies first 16 characters. */
2024 #endif
2025 }
2026 
2027 
2028 static uint64_t uv__read_proc_meminfo(const char* what) {
2029   uint64_t rc;
2030   char* p;
2031   char buf[4096];  /* Large enough to hold all of /proc/meminfo. */
2032 
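       /* /proc/meminfo lines look like "MemAvailable:    1234567 kB"; the
        * value after the requested tag is parsed and converted from KiB to
        * bytes. (Example figure is illustrative.)
        */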
2033   if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
2034     return 0;
2035 
2036   p = strstr(buf, what);
2037 
2038   if (p == NULL)
2039     return 0;
2040 
2041   p += strlen(what);
2042 
2043   rc = 0;
2044   sscanf(p, "%" PRIu64 " kB", &rc);
2045 
2046   return rc * 1024;
2047 }
2048 
2049 
2050 uint64_t uv_get_free_memory(void) {
2051   struct sysinfo info;
2052   uint64_t rc;
2053 
2054   rc = uv__read_proc_meminfo("MemAvailable:");
2055 
2056   if (rc != 0)
2057     return rc;
2058 
2059   if (0 == sysinfo(&info))
2060     return (uint64_t) info.freeram * info.mem_unit;
2061 
2062   return 0;
2063 }
2064 
2065 
2066 uint64_t uv_get_total_memory(void) {
2067   struct sysinfo info;
2068   uint64_t rc;
2069 
2070   rc = uv__read_proc_meminfo("MemTotal:");
2071 
2072   if (rc != 0)
2073     return rc;
2074 
2075   if (0 == sysinfo(&info))
2076     return (uint64_t) info.totalram * info.mem_unit;
2077 
2078   return 0;
2079 }
2080 
2081 
2082 static uint64_t uv__read_uint64(const char* filename) {
2083   char buf[32];  /* Large enough to hold an encoded uint64_t. */
2084   uint64_t rc;
2085 
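       /* cgroup v2 control files report "max" instead of a number when a
        * limit is not set; that case is mapped to UINT64_MAX below.
        */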
2086   rc = 0;
2087   if (0 == uv__slurp(filename, buf, sizeof(buf)))
2088     if (1 != sscanf(buf, "%" PRIu64, &rc))
2089       if (0 == strcmp(buf, "max\n"))
2090         rc = UINT64_MAX;
2091 
2092   return rc;
2093 }
2094 
2095 
2096 /* Given a buffer with the contents of a cgroup1 /proc/self/cgroups,
2097  * finds the location and length of the memory controller mount path.
2098  * This disregards the leading / for easy concatenation of paths.
2099  * Returns NULL if the memory controller wasn't found. */
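     /* Example (illustrative): for a line such as "9:memory:/user.slice", the
      * returned pointer addresses "user.slice" and *n is set to its length.
      */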
2100 static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
2101                                                 int* n) {
2102   char* p;
2103 
2104   /* Seek to the memory controller line. */
2105   p = strchr(buf, ':');
2106   while (p != NULL && strncmp(p, ":memory:", 8)) {
2107     p = strchr(p, '\n');
2108     if (p != NULL)
2109       p = strchr(p, ':');
2110   }
2111 
2112   if (p != NULL) {
2113     /* Determine the length of the mount path. */
2114     p = p + strlen(":memory:/");
2115     *n = (int) strcspn(p, "\n");
2116   }
2117 
2118   return p;
2119 }
2120 
2121 static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
2122                                           uint64_t* max) {
2123   char filename[4097];
2124   char* p;
2125   int n;
2126   uint64_t cgroup1_max;
2127 
2128   /* Find out where the controller is mounted. */
2129   p = uv__cgroup1_find_memory_controller(buf, &n);
2130   if (p != NULL) {
2131     snprintf(filename, sizeof(filename),
2132              "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
2133     *high = uv__read_uint64(filename);
2134 
2135     snprintf(filename, sizeof(filename),
2136              "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
2137     *max = uv__read_uint64(filename);
2138 
2139     /* If the controller wasn't mounted, the reads above will have failed,
2140      * as indicated by uv__read_uint64 returning 0.
2141      */
2142     if (*high != 0 && *max != 0)
2143       goto update_limits;
2144   }
2145 
2146   /* Fall back to the limits of the global memory controller. */
2147   *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
2148   *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
2149 
2150   /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
2151    * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
2152    */
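       /* On a 64-bit system with 4 KiB pages that sentinel is
        * LONG_MAX & ~4095 = 9223372036854771712 (value shown for illustration;
        * it depends on the page size).
        */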
2153 update_limits:
2154   cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
2155   if (*high == cgroup1_max)
2156     *high = UINT64_MAX;
2157   if (*max == cgroup1_max)
2158     *max = UINT64_MAX;
2159 }
2160 
2161 static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
2162                                           uint64_t* max) {
2163   char filename[4097];
2164   char* p;
2165   int n;
2166 
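       /* A cgroup v2 /proc/self/cgroup consists of a single line such as
        * "0::/user.slice/user-1000.slice"; memory.max and memory.high each
        * contain either a byte count or the literal "max".
        * (Path shown is illustrative.)
        */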
2167   /* Find out where the controller is mounted. */
2168   p = buf + strlen("0::/");
2169   n = (int) strcspn(p, "\n");
2170 
2171   /* Read the memory limits of the controller. */
2172   snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
2173   *max = uv__read_uint64(filename);
2174   snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
2175   *high = uv__read_uint64(filename);
2176 }
2177 
2178 static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
2179   uint64_t high;
2180   uint64_t max;
2181 
2182   /* In the case of cgroupv2, we'll only have a single entry. */
2183   if (strncmp(buf, "0::/", 4))
2184     uv__get_cgroup1_memory_limits(buf, &high, &max);
2185   else
2186     uv__get_cgroup2_memory_limits(buf, &high, &max);
2187 
2188   if (high == 0 || max == 0)
2189     return 0;
2190 
2191   return high < max ? high : max;
2192 }
2193 
2194 uint64_t uv_get_constrained_memory(void) {
2195   char buf[1024];
2196 
2197   if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2198     return 0;
2199 
2200   return uv__get_cgroup_constrained_memory(buf);
2201 }
2202 
2203 
2204 static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
2205   char filename[4097];
2206   uint64_t current;
2207   char* p;
2208   int n;
2209 
2210   /* Find out where the controller is mounted. */
2211   p = uv__cgroup1_find_memory_controller(buf, &n);
2212   if (p != NULL) {
2213     snprintf(filename, sizeof(filename),
2214             "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
2215     current = uv__read_uint64(filename);
2216 
2217     /* If the controller wasn't mounted, the reads above will have failed,
2218      * as indicated by uv__read_uint64 returning 0.
2219      */
2220     if (current != 0)
2221       return current;
2222   }
2223 
2224   /* Fall back to the usage of the global memory controller. */
2225   return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
2226 }
2227 
2228 static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
2229   char filename[4097];
2230   char* p;
2231   int n;
2232 
2233   /* Find out where the controller is mounted. */
2234   p = buf + strlen("0::/");
2235   n = (int) strcspn(p, "\n");
2236 
2237   snprintf(filename, sizeof(filename),
2238            "/sys/fs/cgroup/%.*s/memory.current", n, p);
2239   return uv__read_uint64(filename);
2240 }
2241 
2242 uint64_t uv_get_available_memory(void) {
2243   char buf[1024];
2244   uint64_t constrained;
2245   uint64_t current;
2246   uint64_t total;
2247 
2248   if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2249     return 0;
2250 
2251   constrained = uv__get_cgroup_constrained_memory(buf);
2252   if (constrained == 0)
2253     return uv_get_free_memory();
2254 
2255   total = uv_get_total_memory();
2256   if (constrained > total)
2257     return uv_get_free_memory();
2258 
2259   /* In the case of cgroupv2, we'll only have a single entry. */
2260   if (strncmp(buf, "0::/", 4))
2261     current = uv__get_cgroup1_current_memory(buf);
2262   else
2263     current = uv__get_cgroup2_current_memory(buf);
2264 
2265   /* memory usage can be higher than the limit (for short bursts of time) */
2266   if (constrained < current)
2267     return 0;
2268 
2269   return constrained - current;
2270 }
2271 
2272 
2273 static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
2274                                             uv__cpu_constraint* constraint) {
2275   char path[256];
2276   char buf[1024];
2277   unsigned int weight;
2278   int cgroup_size;
2279   const char* cgroup_trimmed;
2280   char quota_buf[16];
2281 
2282   if (strncmp(cgroup, "0::/", 4) != 0)
2283     return UV_EINVAL;
2284 
2285   /* Isolate the cgroup path without modifying the buffer. */
2286   cgroup_trimmed = cgroup + sizeof("0::/") - 1;      /* Skip the prefix "0::/" */
2287   cgroup_size = (int)strcspn(cgroup_trimmed, "\n");  /* Length up to the trailing \n */
2288 
2289   /* Construct the path to the cpu.max file */
2290   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.max", cgroup_size,
2291            cgroup_trimmed);
2292 
2293   /* Read cpu.max */
2294   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2295     return UV_EIO;
2296 
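       /* cpu.max holds "<quota> <period>" in microseconds, e.g.
        * "200000 100000" for two CPUs' worth of runtime, or "max 100000" when
        * the quota is unrestricted. (Example values are illustrative.)
        */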
2297   if (sscanf(buf, "%15s %llu", quota_buf, &constraint->period_length) != 2)
2298     return UV_EINVAL;
2299 
2300   if (strncmp(quota_buf, "max", 3) == 0)
2301     constraint->quota_per_period = LLONG_MAX;
2302   else if (sscanf(quota_buf, "%lld", &constraint->quota_per_period) != 1)
2303     return UV_EINVAL;  /* Conversion failed. */
2304 
2305   /* Construct the path to the cpu.weight file */
2306   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.weight", cgroup_size,
2307            cgroup_trimmed);
2308 
2309   /* Read cpu.weight */
2310   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2311     return UV_EIO;
2312 
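       /* cpu.weight defaults to 100 (valid range 1..10000), so proportions
        * works out to 1.0 for an otherwise unconfigured cgroup.
        */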
2313   if (sscanf(buf, "%u", &weight) != 1)
2314     return UV_EINVAL;
2315 
2316   constraint->proportions = (double)weight / 100.0;
2317 
2318   return 0;
2319 }
2320 
2321 static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
2322                                              int* cgroup_size) {
2323   /* Seek to the cpu controller line. */
2324   char* cgroup_cpu = strstr(cgroup, ":cpu,");
2325 
2326   if (cgroup_cpu != NULL) {
2327     /* Skip the controller prefix to the start of the cgroup path. */
2328     cgroup_cpu += sizeof(":cpu,") - 1;
2329     /* Determine the length of the cgroup path, excluding the newline. */
2330     *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
2331   }
2332 
2333   return cgroup_cpu;
2334 }
2335 
2336 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
2337                                             uv__cpu_constraint* constraint) {
2338   char path[256];
2339   char buf[1024];
2340   unsigned int shares;
2341   int cgroup_size;
2342   char* cgroup_cpu;
2343 
2344   cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
2345 
2346   if (cgroup_cpu == NULL)
2347     return UV_EIO;
2348 
2349   /* Construct the path to the cpu.cfs_quota_us file */
2350   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
2351            cgroup_size, cgroup_cpu);
2352 
2353   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2354     return UV_EIO;
2355 
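       /* cpu.cfs_quota_us is expressed in microseconds; the kernel reports -1
        * when no quota has been set.
        */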
2356   if (sscanf(buf, "%lld", &constraint->quota_per_period) != 1)
2357     return UV_EINVAL;
2358 
2359   /* Construct the path to the cpu.cfs_period_us file */
2360   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
2361            cgroup_size, cgroup_cpu);
2362 
2363   /* Read cpu.cfs_period_us */
2364   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2365     return UV_EIO;
2366 
2367   if (sscanf(buf, "%lld", &constraint->period_length) != 1)
2368     return UV_EINVAL;
2369 
2370   /* Construct the path to the cpu.shares file */
2371   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.shares", cgroup_size,
2372            cgroup_cpu);
2373 
2374   /* Read cpu.shares */
2375   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2376     return UV_EIO;
2377 
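       /* cpu.shares defaults to 1024, so proportions comes out as 1.0 for an
        * otherwise unconfigured cgroup.
        */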
2378   if (sscanf(buf, "%u", &shares) != 1)
2379     return UV_EINVAL;
2380 
2381   constraint->proportions = (double)shares / 1024.0;
2382 
2383   return 0;
2384 }
2385 
2386 int uv__get_constrained_cpu(uv__cpu_constraint* constraint) {
2387   char cgroup[1024];
2388 
2389   /* Read the cgroup from /proc/self/cgroup */
2390   if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
2391     return UV_EIO;
2392 
2393   /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
2394    * The entry for cgroup v2 is always in the format "0::$PATH"
2395    * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
2396   if (strncmp(cgroup, "0::/", 4) == 0)
2397     return uv__get_cgroupv2_constrained_cpu(cgroup, constraint);
2398   else
2399     return uv__get_cgroupv1_constrained_cpu(cgroup, constraint);
2400 }
2401 
2402 
2403 void uv_loadavg(double avg[3]) {
2404   struct sysinfo info;
2405   char buf[128];  /* Large enough to hold all of /proc/loadavg. */
2406 
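       /* /proc/loadavg starts with the three load averages, e.g.
        * "0.61 0.58 0.59 2/345 1234"; the sysinfo() fallback reports the same
        * averages as fixed-point values scaled by 65536 (1 << SI_LOAD_SHIFT).
        * (Example line is illustrative.)
        */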
2407   if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
2408     if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
2409       return;
2410 
2411   if (sysinfo(&info) < 0)
2412     return;
2413 
2414   avg[0] = (double) info.loads[0] / 65536.0;
2415   avg[1] = (double) info.loads[1] / 65536.0;
2416   avg[2] = (double) info.loads[2] / 65536.0;
2417 }
2418 
2419 
2420 static int compare_watchers(const struct watcher_list* a,
2421                             const struct watcher_list* b) {
2422   if (a->wd < b->wd) return -1;
2423   if (a->wd > b->wd) return 1;
2424   return 0;
2425 }
2426 
2427 
2428 static int init_inotify(uv_loop_t* loop) {
2429   int fd;
2430 
2431   if (loop->inotify_fd != -1)
2432     return 0;
2433 
2434   fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
2435   if (fd < 0)
2436     return UV__ERR(errno);
2437 
2438   loop->inotify_fd = fd;
2439   uv__io_init(&loop->inotify_read_watcher, uv__inotify_read, loop->inotify_fd);
2440   uv__io_start(loop, &loop->inotify_read_watcher, POLLIN);
2441 
2442   return 0;
2443 }
2444 
2445 
2446 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
2447   /* Open the inotify_fd, and re-arm all the inotify watchers. */
2448   int err;
2449   struct watcher_list* tmp_watcher_list_iter;
2450   struct watcher_list* watcher_list;
2451   struct watcher_list tmp_watcher_list;
2452   struct uv__queue queue;
2453   struct uv__queue* q;
2454   uv_fs_event_t* handle;
2455   char* tmp_path;
2456 
2457   if (root == NULL)
2458     return 0;
2459 
2460   /* We must restore the old watcher list to be able to close items
2461    * out of it.
2462    */
2463   loop->inotify_watchers = root;
2464 
2465   uv__queue_init(&tmp_watcher_list.watchers);
2466   /* Note that the queue we use is shared with the start and stop()
2467    * functions, making uv__queue_foreach unsafe to use. So we use the
2468    * uv__queue_move trick to safely iterate. Also don't free the watcher
2469    * list until we're done iterating. c.f. uv__inotify_read.
2470    */
2471   RB_FOREACH_SAFE(watcher_list, watcher_root,
2472                   uv__inotify_watchers(loop), tmp_watcher_list_iter) {
2473     watcher_list->iterating = 1;
2474     uv__queue_move(&watcher_list->watchers, &queue);
2475     while (!uv__queue_empty(&queue)) {
2476       q = uv__queue_head(&queue);
2477       handle = uv__queue_data(q, uv_fs_event_t, watchers);
2478       /* It's critical to keep a copy of path here, because it
2479        * will be set to NULL by stop() and then deallocated by
2480        * maybe_free_watcher_list
2481        */
2482       tmp_path = uv__strdup(handle->path);
2483       assert(tmp_path != NULL);
2484       uv__queue_remove(q);
2485       uv__queue_insert_tail(&watcher_list->watchers, q);
2486       uv_fs_event_stop(handle);
2487 
2488       uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
2489       handle->path = tmp_path;
2490     }
2491     watcher_list->iterating = 0;
2492     maybe_free_watcher_list(watcher_list, loop);
2493   }
2494 
2495   uv__queue_move(&tmp_watcher_list.watchers, &queue);
2496   while (!uv__queue_empty(&queue)) {
2497       q = uv__queue_head(&queue);
2498       uv__queue_remove(q);
2499       handle = uv__queue_data(q, uv_fs_event_t, watchers);
2500       tmp_path = handle->path;
2501       handle->path = NULL;
2502       err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
2503       uv__free(tmp_path);
2504       if (err)
2505         return err;
2506   }
2507 
2508   return 0;
2509 }
2510 
2511 
2512 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
2513   struct watcher_list w;
2514   w.wd = wd;
2515   return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
2516 }
2517 
2518 
2519 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
2520   /* if the watcher_list->watchers is being iterated over, we can't free it. */
2521   if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
2522     /* No watchers left for this path. Clean up. */
2523     RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
2524     inotify_rm_watch(loop->inotify_fd, w->wd);
2525     uv__free(w);
2526   }
2527 }
2528 
2529 
2530 static void uv__inotify_read(uv_loop_t* loop,
2531                              uv__io_t* dummy,
2532                              unsigned int events) {
2533   const struct inotify_event* e;
2534   struct watcher_list* w;
2535   uv_fs_event_t* h;
2536   struct uv__queue queue;
2537   struct uv__queue* q;
2538   const char* path;
2539   ssize_t size;
2540   const char *p;
2541   /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
2542   char buf[4096];
2543 
2544   for (;;) {
2545     do
2546       size = read(loop->inotify_fd, buf, sizeof(buf));
2547     while (size == -1 && errno == EINTR);
2548 
2549     if (size == -1) {
2550       assert(errno == EAGAIN || errno == EWOULDBLOCK);
2551       break;
2552     }
2553 
2554     assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */
2555 
2556     /* Now we have one or more inotify_event structs. */
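         /* Each record is a struct inotify_event (wd, mask, cookie, len)
          * immediately followed by len bytes of name, hence the
          * sizeof(*e) + e->len stride below.
          */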
2557     for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
2558       e = (const struct inotify_event*) p;
2559 
2560       events = 0;
2561       if (e->mask & (IN_ATTRIB|IN_MODIFY))
2562         events |= UV_CHANGE;
2563       if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
2564         events |= UV_RENAME;
2565 
2566       w = find_watcher(loop, e->wd);
2567       if (w == NULL)
2568         continue; /* Stale event, no watchers left. */
2569 
2570       /* inotify does not return the filename when monitoring a single file
2571        * for modifications. Repurpose the filename for API compatibility.
2572        * I'm not convinced this is a good thing, maybe it should go.
2573        */
2574       path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);
2575 
2576       /* We're about to iterate over the queue and call user's callbacks.
2577        * What can go wrong?
2578        * A callback could call uv_fs_event_stop()
2579        * and the queue can change under our feet.
2580        * So, we use uv__queue_move() trick to safely iterate over the queue.
2581        * And we don't free the watcher_list until we're done iterating.
2582        *
2583        * First,
2584        * tell uv_fs_event_stop() (that could be called from a user's callback)
2585        * not to free watcher_list.
2586        */
2587       w->iterating = 1;
2588       uv__queue_move(&w->watchers, &queue);
2589       while (!uv__queue_empty(&queue)) {
2590         q = uv__queue_head(&queue);
2591         h = uv__queue_data(q, uv_fs_event_t, watchers);
2592 
2593         uv__queue_remove(q);
2594         uv__queue_insert_tail(&w->watchers, q);
2595 
2596         h->cb(h, path, events, 0);
2597       }
2598       /* done iterating, time to (maybe) free empty watcher_list */
2599       w->iterating = 0;
2600       maybe_free_watcher_list(w, loop);
2601     }
2602   }
2603 }
2604 
2605 
2606 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
2607   uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
2608   return 0;
2609 }
2610 
2611 
2612 int uv_fs_event_start(uv_fs_event_t* handle,
2613                       uv_fs_event_cb cb,
2614                       const char* path,
2615                       unsigned int flags) {
2616   struct watcher_list* w;
2617   uv_loop_t* loop;
2618   size_t len;
2619   int events;
2620   int err;
2621   int wd;
2622 
2623   if (uv__is_active(handle))
2624     return UV_EINVAL;
2625 
2626   loop = handle->loop;
2627 
2628   err = init_inotify(loop);
2629   if (err)
2630     return err;
2631 
2632   events = IN_ATTRIB
2633          | IN_CREATE
2634          | IN_MODIFY
2635          | IN_DELETE
2636          | IN_DELETE_SELF
2637          | IN_MOVE_SELF
2638          | IN_MOVED_FROM
2639          | IN_MOVED_TO;
2640 
2641   wd = inotify_add_watch(loop->inotify_fd, path, events);
2642   if (wd == -1)
2643     return UV__ERR(errno);
2644 
2645   w = find_watcher(loop, wd);
2646   if (w)
2647     goto no_insert;
2648 
2649   len = strlen(path) + 1;
2650   w = uv__malloc(sizeof(*w) + len);
2651   if (w == NULL)
2652     return UV_ENOMEM;
2653 
2654   w->wd = wd;
2655   w->path = memcpy(w + 1, path, len);
2656   uv__queue_init(&w->watchers);
2657   w->iterating = 0;
2658   RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
2659 
2660 no_insert:
2661   uv__handle_start(handle);
2662   uv__queue_insert_tail(&w->watchers, &handle->watchers);
2663   handle->path = w->path;
2664   handle->cb = cb;
2665   handle->wd = wd;
2666 
2667   return 0;
2668 }
2669 
2670 
2671 int uv_fs_event_stop(uv_fs_event_t* handle) {
2672   struct watcher_list* w;
2673 
2674   if (!uv__is_active(handle))
2675     return 0;
2676 
2677   w = find_watcher(handle->loop, handle->wd);
2678   assert(w != NULL);
2679 
2680   handle->wd = -1;
2681   handle->path = NULL;
2682   uv__handle_stop(handle);
2683   uv__queue_remove(&handle->watchers);
2684 
2685   maybe_free_watcher_list(w, handle->loop);
2686 
2687   return 0;
2688 }
2689 
2690 
2691 void uv__fs_event_close(uv_fs_event_t* handle) {
2692   uv_fs_event_stop(handle);
2693 }
2694