/* xref: /libuv/src/unix/linux.c (revision 18d48bc1) */
1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2  * Permission is hereby granted, free of charge, to any person obtaining a copy
3  * of this software and associated documentation files (the "Software"), to
4  * deal in the Software without restriction, including without limitation the
5  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6  * sell copies of the Software, and to permit persons to whom the Software is
7  * furnished to do so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be included in
10  * all copies or substantial portions of the Software.
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18  * IN THE SOFTWARE.
19  */
20 
21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22  * EPOLL* counterparts.  We use the POLL* variants in this file because that
23  * is what libuv uses elsewhere.
24  */
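/* Illustrative note (not in the original source): on Linux these values are
 * numerically identical, e.g. POLLIN == EPOLLIN (0x001), POLLOUT == EPOLLOUT
 * (0x004), POLLERR == EPOLLERR (0x008) and POLLHUP == EPOLLHUP (0x010),
 * which is what allows the POLL* constants below to be handed straight to
 * epoll_ctl() and epoll_pwait().
 */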
25 
26 #include "uv.h"
27 #include "internal.h"
28 
29 #include <inttypes.h>
30 #include <stdatomic.h>
31 #include <stddef.h>  /* offsetof */
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <errno.h>
38 
39 #include <fcntl.h>
40 #include <ifaddrs.h>
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <netpacket/packet.h>
44 #include <sys/epoll.h>
45 #include <sys/inotify.h>
46 #include <sys/mman.h>
47 #include <sys/param.h>
48 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/stat.h>
51 #include <sys/syscall.h>
52 #include <sys/sysinfo.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/utsname.h>
56 #include <time.h>
57 #include <unistd.h>
58 
59 #ifndef __NR_io_uring_setup
60 # define __NR_io_uring_setup 425
61 #endif
62 
63 #ifndef __NR_io_uring_enter
64 # define __NR_io_uring_enter 426
65 #endif
66 
67 #ifndef __NR_io_uring_register
68 # define __NR_io_uring_register 427
69 #endif
70 
71 #ifndef __NR_copy_file_range
72 # if defined(__x86_64__)
73 #  define __NR_copy_file_range 326
74 # elif defined(__i386__)
75 #  define __NR_copy_file_range 377
76 # elif defined(__s390__)
77 #  define __NR_copy_file_range 375
78 # elif defined(__arm__)
79 #  define __NR_copy_file_range 391
80 # elif defined(__aarch64__)
81 #  define __NR_copy_file_range 285
82 # elif defined(__powerpc__)
83 #  define __NR_copy_file_range 379
84 # elif defined(__arc__)
85 #  define __NR_copy_file_range 285
86 # elif defined(__riscv)
87 #  define __NR_copy_file_range 285
88 # endif
89 #endif /* __NR_copy_file_range */
90 
91 #ifndef __NR_statx
92 # if defined(__x86_64__)
93 #  define __NR_statx 332
94 # elif defined(__i386__)
95 #  define __NR_statx 383
96 # elif defined(__aarch64__)
97 #  define __NR_statx 397
98 # elif defined(__arm__)
99 #  define __NR_statx 397
100 # elif defined(__ppc__)
101 #  define __NR_statx 383
102 # elif defined(__s390__)
103 #  define __NR_statx 379
104 # elif defined(__riscv)
105 #  define __NR_statx 291
106 # endif
107 #endif /* __NR_statx */
108 
109 #ifndef __NR_getrandom
110 # if defined(__x86_64__)
111 #  define __NR_getrandom 318
112 # elif defined(__i386__)
113 #  define __NR_getrandom 355
114 # elif defined(__aarch64__)
115 #  define __NR_getrandom 384
116 # elif defined(__arm__)
117 #  define __NR_getrandom 384
118 # elif defined(__ppc__)
119 #  define __NR_getrandom 359
120 # elif defined(__s390__)
121 #  define __NR_getrandom 349
122 # elif defined(__riscv)
123 #  define __NR_getrandom 278
124 # endif
125 #endif /* __NR_getrandom */
126 
127 enum {
128   UV__IORING_SETUP_SQPOLL = 2u,
129   UV__IORING_SETUP_NO_SQARRAY = 0x10000u,
130 };
131 
132 enum {
133   UV__IORING_FEAT_SINGLE_MMAP = 1u,
134   UV__IORING_FEAT_NODROP = 2u,
135   UV__IORING_FEAT_RSRC_TAGS = 1024u,  /* linux v5.13 */
136 };
137 
138 enum {
139   UV__IORING_OP_READV = 1,
140   UV__IORING_OP_WRITEV = 2,
141   UV__IORING_OP_FSYNC = 3,
142   UV__IORING_OP_OPENAT = 18,
143   UV__IORING_OP_CLOSE = 19,
144   UV__IORING_OP_STATX = 21,
145   UV__IORING_OP_EPOLL_CTL = 29,
146   UV__IORING_OP_RENAMEAT = 35,
147   UV__IORING_OP_UNLINKAT = 36,
148   UV__IORING_OP_MKDIRAT = 37,
149   UV__IORING_OP_SYMLINKAT = 38,
150   UV__IORING_OP_LINKAT = 39,
151   UV__IORING_OP_FTRUNCATE = 55,
152 };
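/* Note (cross-checked against include/uapi/linux/io_uring.h; not part of the
 * original source): these values mirror the kernel's IORING_OP_* numbering,
 * e.g. IORING_OP_READV is 1 and IORING_OP_CLOSE is 19, so the sqe->opcode
 * assignments below speak the kernel ABI directly without liburing.
 */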
153 
154 enum {
155   UV__IORING_ENTER_GETEVENTS = 1u,
156   UV__IORING_ENTER_SQ_WAKEUP = 2u,
157 };
158 
159 enum {
160   UV__IORING_SQ_NEED_WAKEUP = 1u,
161   UV__IORING_SQ_CQ_OVERFLOW = 2u,
162 };
163 
164 struct uv__io_cqring_offsets {
165   uint32_t head;
166   uint32_t tail;
167   uint32_t ring_mask;
168   uint32_t ring_entries;
169   uint32_t overflow;
170   uint32_t cqes;
171   uint64_t reserved0;
172   uint64_t reserved1;
173 };
174 
175 STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
176 
177 struct uv__io_sqring_offsets {
178   uint32_t head;
179   uint32_t tail;
180   uint32_t ring_mask;
181   uint32_t ring_entries;
182   uint32_t flags;
183   uint32_t dropped;
184   uint32_t array;
185   uint32_t reserved0;
186   uint64_t reserved1;
187 };
188 
189 STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
190 
191 struct uv__io_uring_cqe {
192   uint64_t user_data;
193   int32_t res;
194   uint32_t flags;
195 };
196 
197 STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
198 
199 struct uv__io_uring_sqe {
200   uint8_t opcode;
201   uint8_t flags;
202   uint16_t ioprio;
203   int32_t fd;
204   union {
205     uint64_t off;
206     uint64_t addr2;
207   };
208   union {
209     uint64_t addr;
210   };
211   uint32_t len;
212   union {
213     uint32_t rw_flags;
214     uint32_t fsync_flags;
215     uint32_t open_flags;
216     uint32_t statx_flags;
217   };
218   uint64_t user_data;
219   union {
220     uint16_t buf_index;
221     uint64_t pad[3];
222   };
223 };
224 
225 STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
226 STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
227 STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
228 STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
229 STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
230 STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
231 STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
232 STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
233 STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
234 STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
235 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
236 
237 struct uv__io_uring_params {
238   uint32_t sq_entries;
239   uint32_t cq_entries;
240   uint32_t flags;
241   uint32_t sq_thread_cpu;
242   uint32_t sq_thread_idle;
243   uint32_t features;
244   uint32_t reserved[4];
245   struct uv__io_sqring_offsets sq_off;  /* 40 bytes */
246   struct uv__io_cqring_offsets cq_off;  /* 40 bytes */
247 };
248 
249 STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
250 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
251 STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
252 
253 STATIC_ASSERT(EPOLL_CTL_ADD < 4);
254 STATIC_ASSERT(EPOLL_CTL_DEL < 4);
255 STATIC_ASSERT(EPOLL_CTL_MOD < 4);
256 
257 struct watcher_list {
258   RB_ENTRY(watcher_list) entry;
259   struct uv__queue watchers;
260   int iterating;
261   char* path;
262   int wd;
263 };
264 
265 struct watcher_root {
266   struct watcher_list* rbh_root;
267 };
268 
269 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
270 static void uv__inotify_read(uv_loop_t* loop,
271                              uv__io_t* w,
272                              unsigned int revents);
273 static int compare_watchers(const struct watcher_list* a,
274                             const struct watcher_list* b);
275 static void maybe_free_watcher_list(struct watcher_list* w,
276                                     uv_loop_t* loop);
277 
278 static void uv__epoll_ctl_flush(int epollfd,
279                                 struct uv__iou* ctl,
280                                 struct epoll_event (*events)[256]);
281 
282 static void uv__epoll_ctl_prep(int epollfd,
283                                struct uv__iou* ctl,
284                                struct epoll_event (*events)[256],
285                                int op,
286                                int fd,
287                                struct epoll_event* e);
288 
289 RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
290 
291 
292 static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
293   /* This cast works because watcher_root is a struct with a pointer as its
294    * sole member. Such type punning is unsafe in the presence of strict
295    * pointer aliasing (and is just plain nasty) but that is why libuv
296    * is compiled with -fno-strict-aliasing.
297    */
298   return (struct watcher_root*) &loop->inotify_watchers;
299 }
300 
301 
302 unsigned uv__kernel_version(void) {
303   static _Atomic unsigned cached_version;
304   struct utsname u;
305   unsigned version;
306   unsigned major;
307   unsigned minor;
308   unsigned patch;
309   char v_sig[256];
310   char* needle;
311 
312   version = atomic_load_explicit(&cached_version, memory_order_relaxed);
313   if (version != 0)
314     return version;
315 
316   /* Check /proc/version_signature first as it's the way to get the mainline
317    * kernel version in Ubuntu. The format is:
318    *   Ubuntu ubuntu_kernel_version mainline_kernel_version
319    * For example:
320    *   Ubuntu 5.15.0-79.86-generic 5.15.111
321    */
322   if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
323     if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
324       goto calculate_version;
325 
326   if (-1 == uname(&u))
327     return 0;
328 
329   /* In Debian we need to check `version` instead of `release` to extract the
330    * mainline kernel version. This is an example of what it looks like:
331    *  #1 SMP Debian 5.10.46-4 (2021-08-03)
332    */
333   needle = strstr(u.version, "Debian ");
334   if (needle != NULL)
335     if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
336       goto calculate_version;
337 
338   if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
339     return 0;
340 
341   /* Handle it when the process runs under the UNAME26 personality:
342    *
343    * - kernels >= 3.x identify as 2.6.40+x
344    * - kernels >= 4.x identify as 2.6.60+x
345    *
346    * UNAME26 is a poorly conceived hack that doesn't let us distinguish
347    * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
348    * that 2.6.60+x means 4.x.
349    *
350    * Fun fact of the day: it's technically possible to observe the actual
351    * kernel version for a brief moment because uname() first copies out the
352    * real release string before overwriting it with the backcompat string.
353    */
354   if (major == 2 && minor == 6) {
355     if (patch >= 60) {
356       major = 4;
357       minor = patch - 60;
358       patch = 0;
359     } else if (patch >= 40) {
360       major = 3;
361       minor = patch - 40;
362       patch = 0;
363     }
364   }
365 
366 calculate_version:
367   version = major * 65536 + minor * 256 + patch;
368   atomic_store_explicit(&cached_version, version, memory_order_relaxed);
369 
370   return version;
371 }
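/* Worked example (added for clarity): the encoding above packs one byte per
 * component, so kernel 5.15.90 becomes
 *
 *   5 * 65536 + 15 * 256 + 90 == 0x050F5A
 *
 * which is the same form used by the version checks elsewhere in this file,
 * e.g. uv__kernel_version() >= 0x060600 means "6.6.0 or newer".
 */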
372 
373 
374 ssize_t
375 uv__fs_copy_file_range(int fd_in,
376                        off_t* off_in,
377                        int fd_out,
378                        off_t* off_out,
379                        size_t len,
380                        unsigned int flags)
381 {
382 #ifdef __NR_copy_file_range
383   return syscall(__NR_copy_file_range,
384                  fd_in,
385                  off_in,
386                  fd_out,
387                  off_out,
388                  len,
389                  flags);
390 #else
391   return errno = ENOSYS, -1;
392 #endif
393 }
394 
395 
396 int uv__statx(int dirfd,
397               const char* path,
398               int flags,
399               unsigned int mask,
400               struct uv__statx* statxbuf) {
401 #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
402   return errno = ENOSYS, -1;
403 #else
404   int rc;
405 
406   rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
407   if (rc >= 0)
408     uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
409 
410   return rc;
411 #endif
412 }
413 
414 
415 ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
416 #if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
417   return errno = ENOSYS, -1;
418 #else
419   ssize_t rc;
420 
421   rc = syscall(__NR_getrandom, buf, buflen, flags);
422   if (rc >= 0)
423     uv__msan_unpoison(buf, buflen);
424 
425   return rc;
426 #endif
427 }
428 
429 
430 int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
431   return syscall(__NR_io_uring_setup, entries, params);
432 }
433 
434 
435 int uv__io_uring_enter(int fd,
436                        unsigned to_submit,
437                        unsigned min_complete,
438                        unsigned flags) {
439   /* io_uring_enter used to take a sigset_t but it's unused
440    * in newer kernels unless IORING_ENTER_EXT_ARG is set,
441    * in which case it takes a struct io_uring_getevents_arg.
442    */
443   return syscall(__NR_io_uring_enter,
444                  fd,
445                  to_submit,
446                  min_complete,
447                  flags,
448                  NULL,
449                  0L);
450 }
451 
452 
453 int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
454   return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
455 }
456 
457 
458 static int uv__use_io_uring(void) {
459 #if defined(__ANDROID_API__)
460   return 0;  /* Possibly available but blocked by seccomp. */
461 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
462   /* See https://github.com/libuv/libuv/issues/4158. */
463   return 0;  /* All 32-bit kernels appear buggy. */
464 #elif defined(__powerpc64__) || defined(__ppc64__)
465   /* See https://github.com/libuv/libuv/issues/4283. */
466   return 0; /* Random SIGSEGV in signal handler. */
467 #else
468   /* Ternary: unknown=0, yes=1, no=-1 */
469   static _Atomic int use_io_uring;
470   char* val;
471   int use;
472 
473   use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
474 
475   if (use == 0) {
476     use = uv__kernel_version() >=
477 #if defined(__hppa__)
478     /* io_uring first supported on parisc in 6.1, functional in .51 */
479     /* https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/ */
480     /* 6.1.51 */ 0x060133
481 #else
482     /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
483     /* 5.10.186 */ 0x050ABA
484 #endif
485     ? 1 : -1;
486 
487     /* But users can still enable it if they so desire. */
488     val = getenv("UV_USE_IO_URING");
489     if (val != NULL)
490       use = atoi(val) ? 1 : -1;
491 
492     atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
493   }
494 
495   return use > 0;
496 #endif
497 }
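/* Usage sketch (illustrative; ./myprog is a placeholder for any program that
 * links against libuv): the run-time override read above works like
 *
 *   $ UV_USE_IO_URING=0 ./myprog   # force the io_uring paths off
 *   $ UV_USE_IO_URING=1 ./myprog   # force them on, even on kernels libuv
 *                                  # would otherwise distrust
 */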
498 
499 
500 static void uv__iou_init(int epollfd,
501                          struct uv__iou* iou,
502                          uint32_t entries,
503                          uint32_t flags) {
504   struct uv__io_uring_params params;
505   struct epoll_event e;
506   size_t cqlen;
507   size_t sqlen;
508   size_t maxlen;
509   size_t sqelen;
510   unsigned kernel_version;
511   uint32_t* sqarray;
512   uint32_t i;
513   char* sq;
514   char* sqe;
515   int ringfd;
516   int no_sqarray;
517 
518   sq = MAP_FAILED;
519   sqe = MAP_FAILED;
520 
521   if (!uv__use_io_uring())
522     return;
523 
524   kernel_version = uv__kernel_version();
525   no_sqarray =
526       UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);
527 
528   /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
529    * Mostly academic because we check for a v5.13 kernel afterwards anyway.
530    */
531   memset(&params, 0, sizeof(params));
532   params.flags = flags | no_sqarray;
533 
534   if (flags & UV__IORING_SETUP_SQPOLL)
535     params.sq_thread_idle = 10;  /* milliseconds */
536 
537   /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
538   ringfd = uv__io_uring_setup(entries, &params);
539   if (ringfd == -1)
540     return;
541 
542   /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
543    * actually detecting is whether IORING_OP_STATX works with SQPOLL.
544    */
545   if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
546     goto fail;
547 
548   /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
549   if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
550     goto fail;
551 
552   /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
553   if (!(params.features & UV__IORING_FEAT_NODROP))
554     goto fail;
555 
556   sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
557   cqlen =
558       params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
559   maxlen = sqlen < cqlen ? cqlen : sqlen;
560   sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
561 
562   sq = mmap(0,
563             maxlen,
564             PROT_READ | PROT_WRITE,
565             MAP_SHARED | MAP_POPULATE,
566             ringfd,
567             0);  /* IORING_OFF_SQ_RING */
568 
569   sqe = mmap(0,
570              sqelen,
571              PROT_READ | PROT_WRITE,
572              MAP_SHARED | MAP_POPULATE,
573              ringfd,
574              0x10000000ull);  /* IORING_OFF_SQES */
575 
576   if (sq == MAP_FAILED || sqe == MAP_FAILED)
577     goto fail;
578 
579   if (flags & UV__IORING_SETUP_SQPOLL) {
580     /* Only interested in completion events. To get notified when
581      * the kernel pulls items from the submission ring, add POLLOUT.
582      */
583     memset(&e, 0, sizeof(e));
584     e.events = POLLIN;
585     e.data.fd = ringfd;
586 
587     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
588       goto fail;
589   }
590 
591   iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
592   iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
593   iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
594   iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
595   iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
596   iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
597   iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
598   iou->sq = sq;
599   iou->cqe = sq + params.cq_off.cqes;
600   iou->sqe = sqe;
601   iou->sqlen = sqlen;
602   iou->cqlen = cqlen;
603   iou->maxlen = maxlen;
604   iou->sqelen = sqelen;
605   iou->ringfd = ringfd;
606   iou->in_flight = 0;
607 
608   if (no_sqarray)
609     return;
610 
611   sqarray = (uint32_t*) (sq + params.sq_off.array);
612   for (i = 0; i <= iou->sqmask; i++)
613     sqarray[i] = i;  /* Slot -> sqe identity mapping. */
614 
615   return;
616 
617 fail:
618   if (sq != MAP_FAILED)
619     munmap(sq, maxlen);
620 
621   if (sqe != MAP_FAILED)
622     munmap(sqe, sqelen);
623 
624   uv__close(ringfd);
625 }
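/* Note (summarizing io_uring(7); not in the original source): with
 * IORING_FEAT_SINGLE_MMAP the submission and completion rings share one
 * mapping, which is why a single mmap() of maxlen bytes at offset 0 covers
 * both and only the SQE array needs the second mapping at IORING_OFF_SQES
 * (0x10000000).  All ring fields are then located through the sq_off/cq_off
 * offsets the kernel filled in above.
 */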
626 
627 
628 static void uv__iou_delete(struct uv__iou* iou) {
629   if (iou->ringfd > -1) {
630     munmap(iou->sq, iou->maxlen);
631     munmap(iou->sqe, iou->sqelen);
632     uv__close(iou->ringfd);
633     iou->ringfd = -1;
634   }
635 }
636 
637 
638 int uv__platform_loop_init(uv_loop_t* loop) {
639   uv__loop_internal_fields_t* lfields;
640 
641   lfields = uv__get_internal_fields(loop);
642   lfields->ctl.ringfd = -1;
643   lfields->iou.ringfd = -2;  /* "uninitialized" */
644 
645   loop->inotify_watchers = NULL;
646   loop->inotify_fd = -1;
647   loop->backend_fd = epoll_create1(O_CLOEXEC);
648 
649   if (loop->backend_fd == -1)
650     return UV__ERR(errno);
651 
652   uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
653 
654   return 0;
655 }
656 
657 
658 int uv__io_fork(uv_loop_t* loop) {
659   int err;
660   struct watcher_list* root;
661 
662   root = uv__inotify_watchers(loop)->rbh_root;
663 
664   uv__close(loop->backend_fd);
665   loop->backend_fd = -1;
666 
667   /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
668   uv__platform_loop_delete(loop);
669 
670   err = uv__platform_loop_init(loop);
671   if (err)
672     return err;
673 
674   return uv__inotify_fork(loop, root);
675 }
676 
677 
678 void uv__platform_loop_delete(uv_loop_t* loop) {
679   uv__loop_internal_fields_t* lfields;
680 
681   lfields = uv__get_internal_fields(loop);
682   uv__iou_delete(&lfields->ctl);
683   uv__iou_delete(&lfields->iou);
684 
685   if (loop->inotify_fd != -1) {
686     uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
687     uv__close(loop->inotify_fd);
688     loop->inotify_fd = -1;
689   }
690 }
691 
692 
693 struct uv__invalidate {
694   struct epoll_event (*prep)[256];
695   struct epoll_event* events;
696   int nfds;
697 };
698 
699 
700 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
701   uv__loop_internal_fields_t* lfields;
702   struct uv__invalidate* inv;
703   struct epoll_event dummy;
704   int i;
705 
706   lfields = uv__get_internal_fields(loop);
707   inv = lfields->inv;
708 
709   /* Invalidate events with the same file descriptor. */
710   if (inv != NULL)
711     for (i = 0; i < inv->nfds; i++)
712       if (inv->events[i].data.fd == fd)
713         inv->events[i].data.fd = -1;
714 
715   /* Remove the file descriptor from the epoll.
716    * This avoids a problem where the same file description remains open
717    * in another process, causing repeated junk epoll events.
718    *
719    * Perform EPOLL_CTL_DEL immediately instead of going through
720    * io_uring's submit queue, otherwise the file descriptor may
721    * be closed by the time the kernel starts the operation.
722    *
723    * We pass in a dummy epoll_event, to work around a bug in old kernels.
724    *
725    * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
726    * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
727    */
728   memset(&dummy, 0, sizeof(dummy));
729   epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
730 }
731 
732 
733 int uv__io_check_fd(uv_loop_t* loop, int fd) {
734   struct epoll_event e;
735   int rc;
736 
737   memset(&e, 0, sizeof(e));
738   e.events = POLLIN;
739   e.data.fd = -1;
740 
741   rc = 0;
742   if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
743     if (errno != EEXIST)
744       rc = UV__ERR(errno);
745 
746   if (rc == 0)
747     if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
748       abort();
749 
750   return rc;
751 }
752 
753 
754 /* Caller must initialize SQE and call uv__iou_submit(). */
755 static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
756                                                 uv_loop_t* loop,
757                                                 uv_fs_t* req) {
758   struct uv__io_uring_sqe* sqe;
759   uint32_t head;
760   uint32_t tail;
761   uint32_t mask;
762   uint32_t slot;
763 
764   /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
765    * initialization failed. Anything else is a valid ring file descriptor.
766    */
767   if (iou->ringfd == -2) {
768     /* By default, the SQPOLL ring is not created. Enable it only if the loop is
769      * configured with UV_LOOP_USE_IO_URING_SQPOLL.
770      */
771     if ((loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL) == 0) {
772       iou->ringfd = -1;
773       return NULL;
774     }
775 
776     uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);
777     if (iou->ringfd == -2)
778       iou->ringfd = -1;  /* "failed" */
779   }
780 
781   if (iou->ringfd == -1)
782     return NULL;
783 
784   head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
785                               memory_order_acquire);
786   tail = *iou->sqtail;
787   mask = iou->sqmask;
788 
789   if ((head & mask) == ((tail + 1) & mask))
790     return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
791 
792   slot = tail & mask;
793   sqe = iou->sqe;
794   sqe = &sqe[slot];
795   memset(sqe, 0, sizeof(*sqe));
796   sqe->user_data = (uintptr_t) req;
797 
798   /* Pacify uv_cancel(). */
799   req->work_req.loop = loop;
800   req->work_req.work = NULL;
801   req->work_req.done = NULL;
802   uv__queue_init(&req->work_req.wq);
803 
804   uv__req_register(loop);
805   iou->in_flight++;
806 
807   return sqe;
808 }
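/* Typical caller pattern (sketch; it mirrors the uv__iou_fs_*() helpers
 * below rather than adding new behavior):
 *
 *   sqe = uv__iou_get_sqe(iou, loop, req);
 *   if (sqe == NULL)
 *     return 0;                       // fall back to the thread pool
 *   sqe->fd = req->file;              // fill in op-specific fields
 *   sqe->opcode = UV__IORING_OP_CLOSE;
 *   uv__iou_submit(iou);
 *   return 1;                         // the ring now owns the request
 */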
809 
810 
811 static void uv__iou_submit(struct uv__iou* iou) {
812   uint32_t flags;
813 
814   atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
815                         *iou->sqtail + 1,
816                         memory_order_release);
817 
818   flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
819                                memory_order_acquire);
820 
821   if (flags & UV__IORING_SQ_NEED_WAKEUP)
822     if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
823       if (errno != EOWNERDEAD)  /* Kernel bug. Harmless, ignore. */
824         perror("libuv: io_uring_enter(wakeup)");  /* Can't happen. */
825 }
826 
827 
828 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
829   struct uv__io_uring_sqe* sqe;
830   struct uv__iou* iou;
831   int kv;
832 
833   kv = uv__kernel_version();
834   /* Work around a poorly understood bug in older kernels where closing a file
835    * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
836    * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
837    * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
838    * but good candidates are the several data race fixes. Interestingly, it
839    * seems to manifest only when running under Docker so the possibility of
840    * a Docker bug can't be completely ruled out either. Yay, computers.
841    * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
842    * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
843    * solved.
844    */
845   if (kv < /* 5.15.90 */ 0x050F5A)
846     return 0;
847 
848   if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
849     return 0;
850 
851 
852   iou = &uv__get_internal_fields(loop)->iou;
853 
854   sqe = uv__iou_get_sqe(iou, loop, req);
855   if (sqe == NULL)
856     return 0;
857 
858   sqe->fd = req->file;
859   sqe->opcode = UV__IORING_OP_CLOSE;
860 
861   uv__iou_submit(iou);
862 
863   return 1;
864 }
865 
866 
867 int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
868   struct uv__io_uring_sqe* sqe;
869   struct uv__iou* iou;
870 
871   if (uv__kernel_version() < /* 6.9 */0x060900)
872     return 0;
873 
874   iou = &uv__get_internal_fields(loop)->iou;
875   sqe = uv__iou_get_sqe(iou, loop, req);
876   if (sqe == NULL)
877     return 0;
878 
879   sqe->fd = req->file;
880   sqe->len = req->off;
881   sqe->opcode = UV__IORING_OP_FTRUNCATE;
882   uv__iou_submit(iou);
883 
884   return 1;
885 }
886 
887 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
888                                   uv_fs_t* req,
889                                   uint32_t fsync_flags) {
890   struct uv__io_uring_sqe* sqe;
891   struct uv__iou* iou;
892 
893   iou = &uv__get_internal_fields(loop)->iou;
894 
895   sqe = uv__iou_get_sqe(iou, loop, req);
896   if (sqe == NULL)
897     return 0;
898 
899   /* Little-known fact: setting sqe->off and sqe->len turns
900    * it into an asynchronous sync_file_range() operation.
901    */
902   sqe->fd = req->file;
903   sqe->fsync_flags = fsync_flags;
904   sqe->opcode = UV__IORING_OP_FSYNC;
905 
906   uv__iou_submit(iou);
907 
908   return 1;
909 }
910 
911 
912 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
913   struct uv__io_uring_sqe* sqe;
914   struct uv__iou* iou;
915 
916   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
917     return 0;
918 
919   iou = &uv__get_internal_fields(loop)->iou;
920   sqe = uv__iou_get_sqe(iou, loop, req);
921   if (sqe == NULL)
922     return 0;
923 
924   sqe->addr = (uintptr_t) req->path;
925   sqe->fd = AT_FDCWD;
926   sqe->addr2 = (uintptr_t) req->new_path;
927   sqe->len = AT_FDCWD;
928   sqe->opcode = UV__IORING_OP_LINKAT;
929 
930   uv__iou_submit(iou);
931 
932   return 1;
933 }
934 
935 
936 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
937   struct uv__io_uring_sqe* sqe;
938   struct uv__iou* iou;
939 
940   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
941     return 0;
942 
943   iou = &uv__get_internal_fields(loop)->iou;
944   sqe = uv__iou_get_sqe(iou, loop, req);
945   if (sqe == NULL)
946     return 0;
947 
948   sqe->addr = (uintptr_t) req->path;
949   sqe->fd = AT_FDCWD;
950   sqe->len = req->mode;
951   sqe->opcode = UV__IORING_OP_MKDIRAT;
952 
953   uv__iou_submit(iou);
954 
955   return 1;
956 }
957 
958 
959 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
960   struct uv__io_uring_sqe* sqe;
961   struct uv__iou* iou;
962 
963   iou = &uv__get_internal_fields(loop)->iou;
964 
965   sqe = uv__iou_get_sqe(iou, loop, req);
966   if (sqe == NULL)
967     return 0;
968 
969   sqe->addr = (uintptr_t) req->path;
970   sqe->fd = AT_FDCWD;
971   sqe->len = req->mode;
972   sqe->opcode = UV__IORING_OP_OPENAT;
973   sqe->open_flags = req->flags | O_CLOEXEC;
974 
975   uv__iou_submit(iou);
976 
977   return 1;
978 }
979 
980 
981 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
982   struct uv__io_uring_sqe* sqe;
983   struct uv__iou* iou;
984 
985   iou = &uv__get_internal_fields(loop)->iou;
986 
987   sqe = uv__iou_get_sqe(iou, loop, req);
988   if (sqe == NULL)
989     return 0;
990 
991   sqe->addr = (uintptr_t) req->path;
992   sqe->fd = AT_FDCWD;
993   sqe->addr2 = (uintptr_t) req->new_path;
994   sqe->len = AT_FDCWD;
995   sqe->opcode = UV__IORING_OP_RENAMEAT;
996 
997   uv__iou_submit(iou);
998 
999   return 1;
1000 }
1001 
1002 
1003 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
1004   struct uv__io_uring_sqe* sqe;
1005   struct uv__iou* iou;
1006 
1007   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
1008     return 0;
1009 
1010   iou = &uv__get_internal_fields(loop)->iou;
1011   sqe = uv__iou_get_sqe(iou, loop, req);
1012   if (sqe == NULL)
1013     return 0;
1014 
1015   sqe->addr = (uintptr_t) req->path;
1016   sqe->fd = AT_FDCWD;
1017   sqe->addr2 = (uintptr_t) req->new_path;
1018   sqe->opcode = UV__IORING_OP_SYMLINKAT;
1019 
1020   uv__iou_submit(iou);
1021 
1022   return 1;
1023 }
1024 
1025 
1026 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
1027   struct uv__io_uring_sqe* sqe;
1028   struct uv__iou* iou;
1029 
1030   iou = &uv__get_internal_fields(loop)->iou;
1031 
1032   sqe = uv__iou_get_sqe(iou, loop, req);
1033   if (sqe == NULL)
1034     return 0;
1035 
1036   sqe->addr = (uintptr_t) req->path;
1037   sqe->fd = AT_FDCWD;
1038   sqe->opcode = UV__IORING_OP_UNLINKAT;
1039 
1040   uv__iou_submit(iou);
1041 
1042   return 1;
1043 }
1044 
1045 
1046 int uv__iou_fs_read_or_write(uv_loop_t* loop,
1047                              uv_fs_t* req,
1048                              int is_read) {
1049   struct uv__io_uring_sqe* sqe;
1050   struct uv__iou* iou;
1051 
1052   /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall back
1053    * to the thread pool on writes. */
1054   if (req->nbufs > IOV_MAX) {
1055     if (is_read)
1056       req->nbufs = IOV_MAX;
1057     else
1058       return 0;
1059   }
1060 
1061   iou = &uv__get_internal_fields(loop)->iou;
1062 
1063   sqe = uv__iou_get_sqe(iou, loop, req);
1064   if (sqe == NULL)
1065     return 0;
1066 
1067   sqe->addr = (uintptr_t) req->bufs;
1068   sqe->fd = req->file;
1069   sqe->len = req->nbufs;
1070   sqe->off = req->off < 0 ? -1 : req->off;
1071   sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1072 
1073   uv__iou_submit(iou);
1074 
1075   return 1;
1076 }
1077 
1078 
1079 int uv__iou_fs_statx(uv_loop_t* loop,
1080                      uv_fs_t* req,
1081                      int is_fstat,
1082                      int is_lstat) {
1083   struct uv__io_uring_sqe* sqe;
1084   struct uv__statx* statxbuf;
1085   struct uv__iou* iou;
1086 
1087   statxbuf = uv__malloc(sizeof(*statxbuf));
1088   if (statxbuf == NULL)
1089     return 0;
1090 
1091   iou = &uv__get_internal_fields(loop)->iou;
1092 
1093   sqe = uv__iou_get_sqe(iou, loop, req);
1094   if (sqe == NULL) {
1095     uv__free(statxbuf);
1096     return 0;
1097   }
1098 
1099   req->ptr = statxbuf;
1100 
1101   sqe->addr = (uintptr_t) req->path;
1102   sqe->addr2 = (uintptr_t) statxbuf;
1103   sqe->fd = AT_FDCWD;
1104   sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1105   sqe->opcode = UV__IORING_OP_STATX;
1106 
1107   if (is_fstat) {
1108     sqe->addr = (uintptr_t) "";
1109     sqe->fd = req->file;
1110     sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1111   }
1112 
1113   if (is_lstat)
1114     sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1115 
1116   uv__iou_submit(iou);
1117 
1118   return 1;
1119 }
1120 
1121 
1122 void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
1123   buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
1124   buf->st_mode = statxbuf->stx_mode;
1125   buf->st_nlink = statxbuf->stx_nlink;
1126   buf->st_uid = statxbuf->stx_uid;
1127   buf->st_gid = statxbuf->stx_gid;
1128   buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
1129   buf->st_ino = statxbuf->stx_ino;
1130   buf->st_size = statxbuf->stx_size;
1131   buf->st_blksize = statxbuf->stx_blksize;
1132   buf->st_blocks = statxbuf->stx_blocks;
1133   buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
1134   buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
1135   buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
1136   buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
1137   buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
1138   buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
1139   buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
1140   buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
1141   buf->st_flags = 0;
1142   buf->st_gen = 0;
1143 }
1144 
1145 
1146 static void uv__iou_fs_statx_post(uv_fs_t* req) {
1147   struct uv__statx* statxbuf;
1148   uv_stat_t* buf;
1149 
1150   buf = &req->statbuf;
1151   statxbuf = req->ptr;
1152   req->ptr = NULL;
1153 
1154   if (req->result == 0) {
1155     uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
1156     uv__statx_to_stat(statxbuf, buf);
1157     req->ptr = buf;
1158   }
1159 
1160   uv__free(statxbuf);
1161 }
1162 
1163 
1164 static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
1165   struct uv__io_uring_cqe* cqe;
1166   struct uv__io_uring_cqe* e;
1167   uv_fs_t* req;
1168   uint32_t head;
1169   uint32_t tail;
1170   uint32_t mask;
1171   uint32_t i;
1172   uint32_t flags;
1173   int nevents;
1174   int rc;
1175 
1176   head = *iou->cqhead;
1177   tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
1178                               memory_order_acquire);
1179   mask = iou->cqmask;
1180   cqe = iou->cqe;
1181   nevents = 0;
1182 
1183   for (i = head; i != tail; i++) {
1184     e = &cqe[i & mask];
1185 
1186     req = (uv_fs_t*) (uintptr_t) e->user_data;
1187     assert(req->type == UV_FS);
1188 
1189     uv__req_unregister(loop);
1190     iou->in_flight--;
1191 
1192     /* If the op is not supported by the kernel retry using the thread pool */
1193     if (e->res == -EOPNOTSUPP) {
1194       uv__fs_post(loop, req);
1195       continue;
1196     }
1197 
1198     /* io_uring stores error codes as negative numbers, same as libuv. */
1199     req->result = e->res;
1200 
1201     switch (req->fs_type) {
1202       case UV_FS_FSTAT:
1203       case UV_FS_LSTAT:
1204       case UV_FS_STAT:
1205         uv__iou_fs_statx_post(req);
1206         break;
1207       default:  /* Squelch -Wswitch warnings. */
1208         break;
1209     }
1210 
1211     uv__metrics_update_idle_time(loop);
1212     req->cb(req);
1213     nevents++;
1214   }
1215 
1216   atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
1217                         tail,
1218                         memory_order_release);
1219 
1220   /* Check whether CQEs overflowed; if so, enter the kernel to make them
1221    * available. Don't grab them immediately but in the next loop iteration to
1222    * avoid loop starvation. */
1223   flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
1224                                memory_order_acquire);
1225 
1226   if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
1227     do
1228       rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
1229     while (rc == -1 && errno == EINTR);
1230 
1231     if (rc < 0)
1232       perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
1233   }
1234 
1235   uv__metrics_inc_events(loop, nevents);
1236   if (uv__get_internal_fields(loop)->current_timeout == 0)
1237     uv__metrics_inc_events_waiting(loop, nevents);
1238 }
1239 
1240 
1241 /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1242  * executed immediately, otherwise the file descriptor may have been closed
1243  * by the time the kernel starts the operation.
1244  */
1245 static void uv__epoll_ctl_prep(int epollfd,
1246                                struct uv__iou* ctl,
1247                                struct epoll_event (*events)[256],
1248                                int op,
1249                                int fd,
1250                                struct epoll_event* e) {
1251   struct uv__io_uring_sqe* sqe;
1252   struct epoll_event* pe;
1253   uint32_t mask;
1254   uint32_t slot;
1255 
1256   assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
1257   assert(ctl->ringfd != -1);
1258 
1259   mask = ctl->sqmask;
1260   slot = (*ctl->sqtail)++ & mask;
1261 
1262   pe = &(*events)[slot];
1263   *pe = *e;
1264 
1265   sqe = ctl->sqe;
1266   sqe = &sqe[slot];
1267 
1268   memset(sqe, 0, sizeof(*sqe));
1269   sqe->addr = (uintptr_t) pe;
1270   sqe->fd = epollfd;
1271   sqe->len = op;
1272   sqe->off = fd;
1273   sqe->opcode = UV__IORING_OP_EPOLL_CTL;
1274   sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
1275 
1276   if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
1277     uv__epoll_ctl_flush(epollfd, ctl, events);
1278 }
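/* Encoding sketch (values chosen for illustration): the user_data written
 * above packs the op, ring slot and fd so that uv__epoll_ctl_flush() can
 * recover them from the CQE.  For op == EPOLL_CTL_ADD (1), slot == 5 and
 * fd == 42:
 *
 *   user_data = 1 | 5 << 2 | (int64_t) 42 << 32;   // 0x2A00000015
 *   op   = 3 & user_data;                          // low two bits -> 1
 *   slot = 255 & (user_data >> 2);                 // next eight bits -> 5
 *   fd   = user_data >> 32;                        // high 32 bits -> 42
 */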
1279 
1280 
1281 static void uv__epoll_ctl_flush(int epollfd,
1282                                 struct uv__iou* ctl,
1283                                 struct epoll_event (*events)[256]) {
1284   struct epoll_event oldevents[256];
1285   struct uv__io_uring_cqe* cqe;
1286   uint32_t oldslot;
1287   uint32_t slot;
1288   uint32_t n;
1289   int fd;
1290   int op;
1291   int rc;
1292 
1293   STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
1294   assert(ctl->ringfd != -1);
1295   assert(*ctl->sqhead != *ctl->sqtail);
1296 
1297   n = *ctl->sqtail - *ctl->sqhead;
1298   do
1299     rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
1300   while (rc == -1 && errno == EINTR);
1301 
1302   if (rc < 0)
1303     perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
1304 
1305   if (rc != (int) n)
1306     abort();
1307 
1308   assert(*ctl->sqhead == *ctl->sqtail);
1309 
1310   memcpy(oldevents, *events, sizeof(*events));
1311 
1312   /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
1313    * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
1314    * that we are already watching. Ignore the former and retry the latter
1315    * with EPOLL_CTL_MOD.
1316    */
1317   while (*ctl->cqhead != *ctl->cqtail) {
1318     slot = (*ctl->cqhead)++ & ctl->cqmask;
1319 
1320     cqe = ctl->cqe;
1321     cqe = &cqe[slot];
1322 
1323     if (cqe->res == 0)
1324       continue;
1325 
1326     fd = cqe->user_data >> 32;
1327     op = 3 & cqe->user_data;
1328     oldslot = 255 & (cqe->user_data >> 2);
1329 
1330     if (op == EPOLL_CTL_DEL)
1331       continue;
1332 
1333     if (op != EPOLL_CTL_ADD)
1334       abort();
1335 
1336     if (cqe->res != -EEXIST)
1337       abort();
1338 
1339     uv__epoll_ctl_prep(epollfd,
1340                        ctl,
1341                        events,
1342                        EPOLL_CTL_MOD,
1343                        fd,
1344                        &oldevents[oldslot]);
1345   }
1346 }
1347 
1348 
1349 void uv__io_poll(uv_loop_t* loop, int timeout) {
1350   uv__loop_internal_fields_t* lfields;
1351   struct epoll_event events[1024];
1352   struct epoll_event prep[256];
1353   struct uv__invalidate inv;
1354   struct epoll_event* pe;
1355   struct epoll_event e;
1356   struct uv__iou* ctl;
1357   struct uv__iou* iou;
1358   int real_timeout;
1359   struct uv__queue* q;
1360   uv__io_t* w;
1361   sigset_t* sigmask;
1362   sigset_t sigset;
1363   uint64_t base;
1364   int have_iou_events;
1365   int have_signals;
1366   int nevents;
1367   int epollfd;
1368   int count;
1369   int nfds;
1370   int fd;
1371   int op;
1372   int i;
1373   int user_timeout;
1374   int reset_timeout;
1375 
1376   lfields = uv__get_internal_fields(loop);
1377   ctl = &lfields->ctl;
1378   iou = &lfields->iou;
1379 
1380   sigmask = NULL;
1381   if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
1382     sigemptyset(&sigset);
1383     sigaddset(&sigset, SIGPROF);
1384     sigmask = &sigset;
1385   }
1386 
1387   assert(timeout >= -1);
1388   base = loop->time;
1389   count = 48; /* Benchmarks suggest this gives the best throughput. */
1390   real_timeout = timeout;
1391 
1392   if (lfields->flags & UV_METRICS_IDLE_TIME) {
1393     reset_timeout = 1;
1394     user_timeout = timeout;
1395     timeout = 0;
1396   } else {
1397     reset_timeout = 0;
1398     user_timeout = 0;
1399   }
1400 
1401   epollfd = loop->backend_fd;
1402 
1403   memset(&e, 0, sizeof(e));
1404 
1405   while (!uv__queue_empty(&loop->watcher_queue)) {
1406     q = uv__queue_head(&loop->watcher_queue);
1407     w = uv__queue_data(q, uv__io_t, watcher_queue);
1408     uv__queue_remove(q);
1409     uv__queue_init(q);
1410 
1411     op = EPOLL_CTL_MOD;
1412     if (w->events == 0)
1413       op = EPOLL_CTL_ADD;
1414 
1415     w->events = w->pevents;
1416     e.events = w->pevents;
1417     e.data.fd = w->fd;
1418     fd = w->fd;
1419 
1420     if (ctl->ringfd != -1) {
1421       uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
1422       continue;
1423     }
1424 
1425     if (!epoll_ctl(epollfd, op, fd, &e))
1426       continue;
1427 
1428     assert(op == EPOLL_CTL_ADD);
1429     assert(errno == EEXIST);
1430 
1431     /* File descriptor that's been watched before, update event mask. */
1432     if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
1433       abort();
1434   }
1435 
1436   inv.events = events;
1437   inv.prep = &prep;
1438   inv.nfds = -1;
1439 
1440   for (;;) {
1441     if (loop->nfds == 0)
1442       if (iou->in_flight == 0)
1443         break;
1444 
1445     /* All event mask mutations should be visible to the kernel before
1446      * we enter epoll_pwait().
1447      */
1448     if (ctl->ringfd != -1)
1449       while (*ctl->sqhead != *ctl->sqtail)
1450         uv__epoll_ctl_flush(epollfd, ctl, &prep);
1451 
1452     /* Only need to set the provider_entry_time if timeout != 0. The function
1453      * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
1454      */
1455     if (timeout != 0)
1456       uv__metrics_set_provider_entry_time(loop);
1457 
1458     /* Store the current timeout in a location that's globally accessible so
1459      * other locations like uv__work_done() can determine whether the queue
1460      * of events in the callback were waiting when poll was called.
1461      */
1462     lfields->current_timeout = timeout;
1463 
1464     nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
1465 
1466     /* Update loop->time unconditionally. It's tempting to skip the update when
1467      * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
1468      * operating system didn't reschedule our process while in the syscall.
1469      */
1470     SAVE_ERRNO(uv__update_time(loop));
1471 
1472     if (nfds == -1)
1473       assert(errno == EINTR);
1474     else if (nfds == 0)
1475       /* Unlimited timeout should only return with events or signal. */
1476       assert(timeout != -1);
1477 
1478     if (nfds == 0 || nfds == -1) {
1479       if (reset_timeout != 0) {
1480         timeout = user_timeout;
1481         reset_timeout = 0;
1482       } else if (nfds == 0) {
1483         return;
1484       }
1485 
1486       /* Interrupted by a signal. Update timeout and poll again. */
1487       goto update_timeout;
1488     }
1489 
1490     have_iou_events = 0;
1491     have_signals = 0;
1492     nevents = 0;
1493 
1494     inv.nfds = nfds;
1495     lfields->inv = &inv;
1496 
1497     for (i = 0; i < nfds; i++) {
1498       pe = events + i;
1499       fd = pe->data.fd;
1500 
1501       /* Skip invalidated events, see uv__platform_invalidate_fd */
1502       if (fd == -1)
1503         continue;
1504 
1505       if (fd == iou->ringfd) {
1506         uv__poll_io_uring(loop, iou);
1507         have_iou_events = 1;
1508         continue;
1509       }
1510 
1511       assert(fd >= 0);
1512       assert((unsigned) fd < loop->nwatchers);
1513 
1514       w = loop->watchers[fd];
1515 
1516       if (w == NULL) {
1517         /* File descriptor that we've stopped watching, disarm it.
1518          *
1519          * Ignore all errors because we may be racing with another thread
1520          * when the file descriptor is closed.
1521          *
1522          * Perform EPOLL_CTL_DEL immediately instead of going through
1523          * io_uring's submit queue, otherwise the file descriptor may
1524          * be closed by the time the kernel starts the operation.
1525          */
1526         epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
1527         continue;
1528       }
1529 
1530       /* Give users only events they're interested in. This prevents spurious
1531        * callbacks when a previous callback invocation in this loop has stopped
1532        * the current watcher. It also filters out events that the user has not
1533        * asked us to watch.
1534        */
1535       pe->events &= w->pevents | POLLERR | POLLHUP;
1536 
1537       /* Work around an epoll quirk where it sometimes reports just the
1538        * EPOLLERR or EPOLLHUP event.  In order to force the event loop to
1539        * move forward, we merge in the read/write events that the watcher
1540        * is interested in; uv__read() and uv__write() will then deal with
1541        * the error or hangup in the usual fashion.
1542        *
1543        * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
1544        * reads the available data, calls uv_read_stop(), then sometime later
1545        * calls uv_read_start() again.  By then, libuv has forgotten about the
1546        * hangup and the kernel won't report EPOLLIN again because there's
1547        * nothing left to read.  If anything, libuv is to blame here.  The
1548        * current hack is just a quick bandaid; to properly fix it, libuv
1549        * needs to remember the error/hangup event.  We should get that for
1550        * free when we switch over to edge-triggered I/O.
1551        */
1552       if (pe->events == POLLERR || pe->events == POLLHUP)
1553         pe->events |=
1554           w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);
1555 
1556       if (pe->events != 0) {
1557         /* Run signal watchers last.  This also affects child process watchers
1558          * because those are implemented in terms of signal watchers.
1559          */
1560         if (w == &loop->signal_io_watcher) {
1561           have_signals = 1;
1562         } else {
1563           uv__metrics_update_idle_time(loop);
1564           w->cb(loop, w, pe->events);
1565         }
1566 
1567         nevents++;
1568       }
1569     }
1570 
1571     uv__metrics_inc_events(loop, nevents);
1572     if (reset_timeout != 0) {
1573       timeout = user_timeout;
1574       reset_timeout = 0;
1575       uv__metrics_inc_events_waiting(loop, nevents);
1576     }
1577 
1578     if (have_signals != 0) {
1579       uv__metrics_update_idle_time(loop);
1580       loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
1581     }
1582 
1583     lfields->inv = NULL;
1584 
1585     if (have_iou_events != 0)
1586       break;  /* Event loop should cycle now so don't poll again. */
1587 
1588     if (have_signals != 0)
1589       break;  /* Event loop should cycle now so don't poll again. */
1590 
1591     if (nevents != 0) {
1592       if (nfds == ARRAY_SIZE(events) && --count != 0) {
1593         /* Poll for more events but don't block this time. */
1594         timeout = 0;
1595         continue;
1596       }
1597       break;
1598     }
1599 
1600 update_timeout:
1601     if (timeout == 0)
1602       break;
1603 
1604     if (timeout == -1)
1605       continue;
1606 
1607     assert(timeout > 0);
1608 
1609     real_timeout -= (loop->time - base);
1610     if (real_timeout <= 0)
1611       break;
1612 
1613     timeout = real_timeout;
1614   }
1615 
1616   if (ctl->ringfd != -1)
1617     while (*ctl->sqhead != *ctl->sqtail)
1618       uv__epoll_ctl_flush(epollfd, ctl, &prep);
1619 }
1620 
1621 uint64_t uv__hrtime(uv_clocktype_t type) {
1622   static _Atomic clock_t fast_clock_id = -1;
1623   struct timespec t;
1624   clock_t clock_id;
1625 
1626   /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
1627    * millisecond granularity or better.  CLOCK_MONOTONIC_COARSE is
1628    * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
1629    * decide to make a costly system call.
1630    */
1631   /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
1632    * when it has microsecond granularity or better (unlikely).
1633    */
1634   clock_id = CLOCK_MONOTONIC;
1635   if (type != UV_CLOCK_FAST)
1636     goto done;
1637 
1638   clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
1639   if (clock_id != -1)
1640     goto done;
1641 
1642   clock_id = CLOCK_MONOTONIC;
1643   if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
1644     if (t.tv_nsec <= 1 * 1000 * 1000)
1645       clock_id = CLOCK_MONOTONIC_COARSE;
1646 
1647   atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
1648 
1649 done:
1650 
1651   if (clock_gettime(clock_id, &t))
1652     return 0;  /* Not really possible. */
1653 
1654   return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
1655 }
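/* Example (illustrative): the value returned above is in nanoseconds, so a
 * caller that wants a millisecond timestamp would do something like
 *
 *   uint64_t now_ms = uv__hrtime(UV_CLOCK_FAST) / 1000000;
 */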
1656 
1657 
1658 int uv_resident_set_memory(size_t* rss) {
1659   char buf[1024];
1660   const char* s;
1661   long val;
1662   int rc;
1663   int i;
1664 
1665   /* rss: 24th element */
1666   rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1667   if (rc < 0)
1668     return rc;
1669 
1670   /* find the last ')' */
1671   s = strrchr(buf, ')');
1672   if (s == NULL)
1673     goto err;
1674 
1675   for (i = 1; i <= 22; i++) {
1676     s = strchr(s + 1, ' ');
1677     if (s == NULL)
1678       goto err;
1679   }
1680 
1681   errno = 0;
1682   val = strtol(s, NULL, 10);
1683   if (val < 0 || errno != 0)
1684     goto err;
1685 
1686   *rss = val * getpagesize();
1687   return 0;
1688 
1689 err:
1690   return UV_EINVAL;
1691 }
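/* Parsing note (per proc(5); added for clarity): the comm field (field 2) is
 * the only field that may contain spaces or parentheses, which is why the
 * code anchors on the last ')' and then skips 22 more space-separated fields
 * to reach field 24, rss.  rss is reported in pages, hence the
 * getpagesize() multiplication above.
 */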
1692 
1693 int uv_uptime(double* uptime) {
1694   struct timespec now;
1695   char buf[128];
1696 
1697   /* Consult /proc/uptime when present (common case), or fall back to
1698    * clock_gettime. Why not always clock_gettime? It doesn't always return the
1699    * right result under OpenVZ and possibly other containerized environments.
1700    */
1701   if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1702     if (1 == sscanf(buf, "%lf", uptime))
1703       return 0;
1704 
1705   if (clock_gettime(CLOCK_BOOTTIME, &now))
1706     return UV__ERR(errno);
1707 
1708   *uptime = now.tv_sec;
1709   return 0;
1710 }
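/* Format note (per proc(5); example values are made up): /proc/uptime holds
 * two floating point numbers, "uptime idle", e.g.
 *
 *   354625.12 708912.40
 *
 * and only the first one is consumed by the sscanf() above.
 */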
1711 
1712 
1713 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
1714 #if defined(__PPC__)
1715   static const char model_marker[] = "cpu\t\t: ";
1716 #elif defined(__arm__)
1717   static const char model_marker[] = "Processor\t: ";
1718 #elif defined(__aarch64__)
1719   static const char model_marker[] = "CPU part\t: ";
1720 #elif defined(__mips__)
1721   static const char model_marker[] = "cpu model\t\t: ";
1722 #elif defined(__loongarch__)
1723   static const char model_marker[] = "cpu family\t\t: ";
1724 #else
1725   static const char model_marker[] = "model name\t: ";
1726 #endif
1727   static const char parts[] =
1728 #ifdef __aarch64__
1729     "0x811\nARM810\n"       "0x920\nARM920\n"      "0x922\nARM922\n"
1730     "0x926\nARM926\n"       "0x940\nARM940\n"      "0x946\nARM946\n"
1731     "0x966\nARM966\n"       "0xa20\nARM1020\n"      "0xa22\nARM1022\n"
1732     "0xa26\nARM1026\n"      "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
1733     "0xb56\nARM1156\n"      "0xb76\nARM1176\n"      "0xc05\nCortex-A5\n"
1734     "0xc07\nCortex-A7\n"    "0xc08\nCortex-A8\n"    "0xc09\nCortex-A9\n"
1735     "0xc0d\nCortex-A17\n"   /* Originally A12 */
1736     "0xc0f\nCortex-A15\n"   "0xc0e\nCortex-A17\n"   "0xc14\nCortex-R4\n"
1737     "0xc15\nCortex-R5\n"    "0xc17\nCortex-R7\n"    "0xc18\nCortex-R8\n"
1738     "0xc20\nCortex-M0\n"    "0xc21\nCortex-M1\n"    "0xc23\nCortex-M3\n"
1739     "0xc24\nCortex-M4\n"    "0xc27\nCortex-M7\n"    "0xc60\nCortex-M0+\n"
1740     "0xd01\nCortex-A32\n"   "0xd03\nCortex-A53\n"   "0xd04\nCortex-A35\n"
1741     "0xd05\nCortex-A55\n"   "0xd06\nCortex-A65\n"   "0xd07\nCortex-A57\n"
1742     "0xd08\nCortex-A72\n"   "0xd09\nCortex-A73\n"   "0xd0a\nCortex-A75\n"
1743     "0xd0b\nCortex-A76\n"   "0xd0c\nNeoverse-N1\n"  "0xd0d\nCortex-A77\n"
1744     "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n"   "0xd20\nCortex-M23\n"
1745     "0xd21\nCortex-M33\n"   "0xd41\nCortex-A78\n"   "0xd42\nCortex-A78AE\n"
1746     "0xd4a\nNeoverse-E1\n"  "0xd4b\nCortex-A78C\n"
1747 #endif
1748     "";
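       /* The table above is a flat sequence of "part-code\nname\n" pairs; the
        * lookup further down memmem()s for the part code read from
        * /proc/cpuinfo (including its trailing newline), then takes the name
        * that follows it.
        */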
1749   struct cpu {
1750     unsigned long long freq, user, nice, sys, idle, irq;
1751     unsigned model;
1752   };
1753   FILE* fp;
1754   char* p;
1755   int found;
1756   int n;
1757   unsigned i;
1758   unsigned cpu;
1759   unsigned maxcpu;
1760   unsigned size;
1761   unsigned long long skip;
1762   struct cpu (*cpus)[8192];  /* Kernel maximum. */
1763   struct cpu* c;
1764   struct cpu t;
1765   char (*model)[64];
1766   unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
1767   /* Assumption: even big.LITTLE systems will have only a handful
1768    * of different CPU models. Most systems will just have one.
1769    */
1770   char models[8][64];
1771   char buf[1024];
1772 
1773   memset(bitmap, 0, sizeof(bitmap));
1774   memset(models, 0, sizeof(models));
1775   snprintf(*models, sizeof(*models), "unknown");
1776   maxcpu = 0;
1777 
1778   cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
1779   if (cpus == NULL)
1780     return UV_ENOMEM;
1781 
1782   fp = uv__open_file("/proc/stat");
1783   if (fp == NULL) {
1784     uv__free(cpus);
1785     return UV__ERR(errno);
1786   }
1787 
1788   if (NULL == fgets(buf, sizeof(buf), fp))
1789     abort();
1790 
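       /* Each remaining line has the form "cpuN user nice system idle iowait
        * irq softirq ..." with counts in USER_HZ ticks; the fscanf() below
        * keeps user/nice/system/idle/irq and discards iowait (the skip field).
        * The aggregate "cpu ..." line was consumed by the fgets() above.
        */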
1791   for (;;) {
1792     memset(&t, 0, sizeof(t));
1793 
1794     n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
1795                &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
1796 
1797     if (n != 7)
1798       break;
1799 
1800     if (NULL == fgets(buf, sizeof(buf), fp))
1801       abort();
1802 
1803     if (cpu >= ARRAY_SIZE(*cpus))
1804       continue;
1805 
1806     (*cpus)[cpu] = t;
1807 
1808     bitmap[cpu >> 3] |= 1 << (cpu & 7);
1809 
1810     if (cpu >= maxcpu)
1811       maxcpu = cpu + 1;
1812   }
1813 
1814   fclose(fp);
1815 
1816   fp = uv__open_file("/proc/cpuinfo");
1817   if (fp == NULL)
1818     goto nocpuinfo;
1819 
1820   for (;;) {
1821     if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
1822       break;  /* Parse error. */
1823 
1824     found = 0;
1825     while (!found && fgets(buf, sizeof(buf), fp))
1826       found = !strncmp(buf, model_marker, sizeof(model_marker) - 1);
1827 
1828     if (!found)
1829       goto next;
1830 
1831     p = buf + sizeof(model_marker) - 1;
1832     n = (int) strcspn(p, "\n");
1833 
1834     /* arm64: translate CPU part code to model name. */
1835     if (*parts) {
1836       p = memmem(parts, sizeof(parts) - 1, p, n + 1);
1837       if (p == NULL)
1838         p = "unknown";
1839       else
1840         p += n + 1;
1841       n = (int) strcspn(p, "\n");
1842     }
1843 
1844     found = 0;
1845     for (model = models; !found && model < ARRAY_END(models); model++)
1846       found = !strncmp(p, *model, strlen(*model));
1847 
1848     if (!found)
1849       goto next;
1850 
1851     if (**model == '\0')
1852       snprintf(*model, sizeof(*model), "%.*s", n, p);
1853 
1854     if (cpu < maxcpu)
1855       (*cpus)[cpu].model = model - models;
1856 
1857 next:
1858     while (fgets(buf, sizeof(buf), fp))
1859       if (*buf == '\n')
1860         break;
1861   }
1862 
1863   fclose(fp);
1864   fp = NULL;
1865 
1866 nocpuinfo:
1867 
1868   n = 0;
1869   for (cpu = 0; cpu < maxcpu; cpu++) {
1870     if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1871       continue;
1872 
1873     n++;
1874     snprintf(buf, sizeof(buf),
1875              "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
1876 
1877     fp = uv__open_file(buf);
1878     if (fp == NULL)
1879       continue;
1880 
1881     if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
1882       abort();
1883     fclose(fp);
1884     fp = NULL;
1885   }
1886 
1887   size = n * sizeof(**ci) + sizeof(models);
1888   *ci = uv__malloc(size);
1889   *count = 0;
1890 
1891   if (*ci == NULL) {
1892     uv__free(cpus);
1893     return UV_ENOMEM;
1894   }
1895 
1896   *count = n;
1897   p = memcpy(*ci + n, models, sizeof(models));
1898 
1899   i = 0;
1900   for (cpu = 0; cpu < maxcpu; cpu++) {
1901     if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1902       continue;
1903 
1904     c = *cpus + cpu;
1905 
1906     (*ci)[i++] = (uv_cpu_info_t) {
1907       .model     = p + c->model * sizeof(*model),
1908       .speed     = c->freq / 1000,
1909       /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
1910        * therefore the multiplier is always 1000/100 = 10.
1911        */
1912       .cpu_times = (struct uv_cpu_times_s) {
1913         .user = 10 * c->user,
1914         .nice = 10 * c->nice,
1915         .sys  = 10 * c->sys,
1916         .idle = 10 * c->idle,
1917         .irq  = 10 * c->irq,
1918       },
1919     };
1920   }
1921 
1922   uv__free(cpus);
1923 
1924   return 0;
1925 }
1926 
1927 
1928 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
1929   if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
1930     return 1;
1931   if (ent->ifa_addr == NULL)
1932     return 1;
1933   /*
1934    * On Linux getifaddrs returns information related to the raw underlying
1935    * devices. We're not interested in this information yet.
1936    */
1937   if (ent->ifa_addr->sa_family == PF_PACKET)
1938     return exclude_type;
1939   return !exclude_type;
1940 }
1941 
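     /* Callers pass UV__EXCLUDE_IFADDR when enumerating addresses, which skips
      * the AF_PACKET entries, and UV__EXCLUDE_IFPHYS when collecting physical
      * addresses, which keeps only the AF_PACKET entries.
      */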
1942 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
1943   struct ifaddrs *addrs, *ent;
1944   uv_interface_address_t* address;
1945   int i;
1946   struct sockaddr_ll *sll;
1947 
1948   *count = 0;
1949   *addresses = NULL;
1950 
1951   if (getifaddrs(&addrs))
1952     return UV__ERR(errno);
1953 
1954   /* Count the number of interfaces */
1955   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1956     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1957       continue;
1958 
1959     (*count)++;
1960   }
1961 
1962   if (*count == 0) {
1963     freeifaddrs(addrs);
1964     return 0;
1965   }
1966 
1967   /* Make sure the memory is initialized to zero using calloc(). */
1968   *addresses = uv__calloc(*count, sizeof(**addresses));
1969   if (!(*addresses)) {
1970     freeifaddrs(addrs);
1971     return UV_ENOMEM;
1972   }
1973 
1974   address = *addresses;
1975 
1976   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1977     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1978       continue;
1979 
1980     address->name = uv__strdup(ent->ifa_name);
1981 
1982     if (ent->ifa_addr->sa_family == AF_INET6) {
1983       address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
1984     } else {
1985       address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
1986     }
1987 
1988     if (ent->ifa_netmask->sa_family == AF_INET6) {
1989       address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
1990     } else {
1991       address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
1992     }
1993 
1994     address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
1995 
1996     address++;
1997   }
1998 
1999   /* Fill in physical addresses for each interface */
2000   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
2001     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
2002       continue;
2003 
2004     address = *addresses;
2005 
2006     for (i = 0; i < (*count); i++) {
2007       size_t namelen = strlen(ent->ifa_name);
2008       /* Alias interfaces share the same physical address. */
2009       if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
2010           (address->name[namelen] == 0 || address->name[namelen] == ':')) {
2011         sll = (struct sockaddr_ll*)ent->ifa_addr;
2012         memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
2013       }
2014       address++;
2015     }
2016   }
2017 
2018   freeifaddrs(addrs);
2019 
2020   return 0;
2021 }
2022 
2023 
2024 void uv_free_interface_addresses(uv_interface_address_t* addresses,
2025                                  int count) {
2026   int i;
2027 
2028   for (i = 0; i < count; i++) {
2029     uv__free(addresses[i].name);
2030   }
2031 
2032   uv__free(addresses);
2033 }
2034 
2035 
2036 void uv__set_process_title(const char* title) {
2037 #if defined(PR_SET_NAME)
2038   prctl(PR_SET_NAME, title);  /* Only copies first 16 characters. */
2039 #endif
2040 }
2041 
2042 
2043 static uint64_t uv__read_proc_meminfo(const char* what) {
2044   uint64_t rc;
2045   char* p;
2046   char buf[4096];  /* Large enough to hold all of /proc/meminfo. */
2047 
2048   if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
2049     return 0;
2050 
2051   p = strstr(buf, what);
2052 
2053   if (p == NULL)
2054     return 0;
2055 
2056   p += strlen(what);
2057 
2058   rc = 0;
2059   sscanf(p, "%" PRIu64 " kB", &rc);
2060 
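       /* For illustration (value made up), the matched line looks like
        * "MemAvailable:    2048000 kB"; /proc/meminfo reports kibibytes,
        * hence the conversion to bytes below.
        */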
2061   return rc * 1024;
2062 }
2063 
2064 
2065 uint64_t uv_get_free_memory(void) {
2066   struct sysinfo info;
2067   uint64_t rc;
2068 
2069   rc = uv__read_proc_meminfo("MemAvailable:");
2070 
2071   if (rc != 0)
2072     return rc;
2073 
2074   if (0 == sysinfo(&info))
2075     return (uint64_t) info.freeram * info.mem_unit;
2076 
2077   return 0;
2078 }
2079 
2080 
2081 uint64_t uv_get_total_memory(void) {
2082   struct sysinfo info;
2083   uint64_t rc;
2084 
2085   rc = uv__read_proc_meminfo("MemTotal:");
2086 
2087   if (rc != 0)
2088     return rc;
2089 
2090   if (0 == sysinfo(&info))
2091     return (uint64_t) info.totalram * info.mem_unit;
2092 
2093   return 0;
2094 }
2095 
2096 
2097 static uint64_t uv__read_uint64(const char* filename) {
2098   char buf[32];  /* Large enough to hold an encoded uint64_t. */
2099   uint64_t rc;
2100 
2101   rc = 0;
2102   if (0 == uv__slurp(filename, buf, sizeof(buf)))
2103     if (1 != sscanf(buf, "%" PRIu64, &rc))
2104       if (0 == strcmp(buf, "max\n"))
2105         rc = UINT64_MAX;
2106 
2107   return rc;
2108 }
2109 
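     /* The cgroup files read through uv__read_uint64() contain either a plain
      * number (e.g. "536870912\n") or, for cgroup v2, the literal "max\n"
      * meaning "no limit", which is mapped to UINT64_MAX above.
      */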
2110 
2111 /* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
2112  * finds the location and length of the memory controller mount path.
2113  * This disregards the leading / for easy concatenation of paths.
2114  * Returns NULL if the memory controller wasn't found. */
2115 static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
2116                                                 int* n) {
2117   char* p;
2118 
2119   /* Seek to the memory controller line. */
2120   p = strchr(buf, ':');
2121   while (p != NULL && strncmp(p, ":memory:", 8)) {
2122     p = strchr(p, '\n');
2123     if (p != NULL)
2124       p = strchr(p, ':');
2125   }
2126 
2127   if (p != NULL) {
2128     /* Determine the length of the mount path. */
2129     p = p + strlen(":memory:/");
2130     *n = (int) strcspn(p, "\n");
2131   }
2132 
2133   return p;
2134 }
2135 
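     /* For illustration (path made up), a cgroup v1 /proc/self/cgroup has one
      * line per hierarchy, e.g.
      *
      *   9:memory:/user.slice/session-1.scope
      *
      * for which the helper above returns a pointer to "user.slice/..." and
      * its length, ready to be spliced into a /sys/fs/cgroup/memory/... path.
      */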
2136 static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
2137                                           uint64_t* max) {
2138   char filename[4097];
2139   char* p;
2140   int n;
2141   uint64_t cgroup1_max;
2142 
2143   /* Find out where the controller is mounted. */
2144   p = uv__cgroup1_find_memory_controller(buf, &n);
2145   if (p != NULL) {
2146     snprintf(filename, sizeof(filename),
2147              "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
2148     *high = uv__read_uint64(filename);
2149 
2150     snprintf(filename, sizeof(filename),
2151              "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
2152     *max = uv__read_uint64(filename);
2153 
2154     /* If the controller wasn't mounted, the reads above will have failed,
2155      * as indicated by uv__read_uint64 returning 0.
2156      */
2157     if (*high != 0 && *max != 0)
2158       goto update_limits;
2159   }
2160 
2161   /* Fall back to the limits of the global memory controller. */
2162   *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
2163   *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
2164 
2165   /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
2166    * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
2167    */
2168 update_limits:
2169   cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
2170   if (*high == cgroup1_max)
2171     *high = UINT64_MAX;
2172   if (*max == cgroup1_max)
2173     *max = UINT64_MAX;
2174 }
2175 
2176 static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
2177                                           uint64_t* max) {
2178   char filename[4097];
2179   char* p;
2180   int n;
2181 
2182   /* Find out where the controller is mounted. */
2183   p = buf + strlen("0::/");
2184   n = (int) strcspn(p, "\n");
2185 
2186   /* Read the memory limits of the controller. */
2187   snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
2188   *max = uv__read_uint64(filename);
2189   snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
2190   *high = uv__read_uint64(filename);
2191 }
2192 
2193 static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
2194   uint64_t high;
2195   uint64_t max;
2196 
2197   /* In the case of cgroupv2, we'll only have a single entry. */
2198   if (strncmp(buf, "0::/", 4))
2199     uv__get_cgroup1_memory_limits(buf, &high, &max);
2200   else
2201     uv__get_cgroup2_memory_limits(buf, &high, &max);
2202 
2203   if (high == 0 || max == 0)
2204     return 0;
2205 
2206   return high < max ? high : max;
2207 }
2208 
2209 uint64_t uv_get_constrained_memory(void) {
2210   char buf[1024];
2211 
2212   if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2213     return 0;
2214 
2215   return uv__get_cgroup_constrained_memory(buf);
2216 }
2217 
2218 
2219 static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
2220   char filename[4097];
2221   uint64_t current;
2222   char* p;
2223   int n;
2224 
2225   /* Find out where the controller is mounted. */
2226   p = uv__cgroup1_find_memory_controller(buf, &n);
2227   if (p != NULL) {
2228     snprintf(filename, sizeof(filename),
2229             "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
2230     current = uv__read_uint64(filename);
2231 
2232     /* If the controller wasn't mounted, the reads above will have failed,
2233      * as indicated by uv__read_uint64 returning 0.
2234      */
2235     if (current != 0)
2236       return current;
2237   }
2238 
2239   /* Fall back to the usage of the global memory controller. */
2240   return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
2241 }
2242 
2243 static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
2244   char filename[4097];
2245   char* p;
2246   int n;
2247 
2248   /* Find out where the controller is mounted. */
2249   p = buf + strlen("0::/");
2250   n = (int) strcspn(p, "\n");
2251 
2252   snprintf(filename, sizeof(filename),
2253            "/sys/fs/cgroup/%.*s/memory.current", n, p);
2254   return uv__read_uint64(filename);
2255 }
2256 
2257 uint64_t uv_get_available_memory(void) {
2258   char buf[1024];
2259   uint64_t constrained;
2260   uint64_t current;
2261   uint64_t total;
2262 
2263   if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2264     return 0;
2265 
2266   constrained = uv__get_cgroup_constrained_memory(buf);
2267   if (constrained == 0)
2268     return uv_get_free_memory();
2269 
2270   total = uv_get_total_memory();
2271   if (constrained > total)
2272     return uv_get_free_memory();
2273 
2274   /* In the case of cgroupv2, we'll only have a single entry. */
2275   if (strncmp(buf, "0::/", 4))
2276     current = uv__get_cgroup1_current_memory(buf);
2277   else
2278     current = uv__get_cgroup2_current_memory(buf);
2279 
2280   /* memory usage can be higher than the limit (for short bursts of time) */
2281   if (constrained < current)
2282     return 0;
2283 
2284   return constrained - current;
2285 }
2286 
2287 
2288 static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
2289                                             uv__cpu_constraint* constraint) {
2290   char path[256];
2291   char buf[1024];
2292   unsigned int weight;
2293   int cgroup_size;
2294   const char* cgroup_trimmed;
2295   char quota_buf[16];
2296 
2297   if (strncmp(cgroup, "0::/", 4) != 0)
2298     return UV_EINVAL;
2299 
2300   /* Locate the cgroup path and determine its length, excluding the newline. */
2301   cgroup_trimmed = cgroup + sizeof("0::/") - 1;      /* Skip the prefix "0::/" */
2302   cgroup_size = (int)strcspn(cgroup_trimmed, "\n");  /* Length up to the newline */
2303 
2304   /* Construct the path to the cpu.max file */
2305   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.max", cgroup_size,
2306            cgroup_trimmed);
2307 
2308   /* Read cpu.max */
2309   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2310     return UV_EIO;
2311 
2312   if (sscanf(buf, "%15s %llu", quota_buf, &constraint->period_length) != 2)
2313     return UV_EINVAL;
2314 
2315   if (strncmp(quota_buf, "max", 3) == 0)
2316     constraint->quota_per_period = LLONG_MAX;
2317   else if (sscanf(quota_buf, "%lld", &constraint->quota_per_period) != 1)
2318     return UV_EINVAL;  /* Conversion failed. */
2319 
2320   /* Construct the path to the cpu.weight file */
2321   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.weight", cgroup_size,
2322            cgroup_trimmed);
2323 
2324   /* Read cpu.weight */
2325   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2326     return UV_EIO;
2327 
2328   if (sscanf(buf, "%u", &weight) != 1)
2329     return UV_EINVAL;
2330 
2331   constraint->proportions = (double)weight / 100.0;
2332 
2333   return 0;
2334 }
2335 
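     /* For illustration (numbers made up), cgroup v2 exposes, e.g.,
      *
      *   cpu.max:    "50000 100000"   quota and period in usec;
      *                                "max 100000" means no quota
      *   cpu.weight: "100"            default weight, parsed above as 1.0
      */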
2336 static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
2337                                              int* cgroup_size) {
2338   /* Seek to the cpu controller line. */
2339   char* cgroup_cpu = strstr(cgroup, ":cpu,");
2340 
2341   if (cgroup_cpu != NULL) {
2342     /* Skip the controller prefix to the start of the cgroup path. */
2343     cgroup_cpu += sizeof(":cpu,") - 1;
2344     /* Determine the length of the cgroup path, excluding the newline. */
2345     *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
2346   }
2347 
2348   return cgroup_cpu;
2349 }
2350 
2351 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
2352                                             uv__cpu_constraint* constraint) {
2353   char path[256];
2354   char buf[1024];
2355   unsigned int shares;
2356   int cgroup_size;
2357   char* cgroup_cpu;
2358 
2359   cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
2360 
2361   if (cgroup_cpu == NULL)
2362     return UV_EIO;
2363 
2364   /* Construct the path to the cpu.cfs_quota_us file */
2365   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
2366            cgroup_size, cgroup_cpu);
2367 
2368   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2369     return UV_EIO;
2370 
2371   if (sscanf(buf, "%lld", &constraint->quota_per_period) != 1)
2372     return UV_EINVAL;
2373 
2374   /* Construct the path to the cpu.cfs_period_us file */
2375   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
2376            cgroup_size, cgroup_cpu);
2377 
2378   /* Read cpu.cfs_period_us */
2379   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2380     return UV_EIO;
2381 
2382   if (sscanf(buf, "%lld", &constraint->period_length) != 1)
2383     return UV_EINVAL;
2384 
2385   /* Construct the path to the cpu.shares file */
2386   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.shares", cgroup_size,
2387            cgroup_cpu);
2388 
2389   /* Read cpu.shares */
2390   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2391     return UV_EIO;
2392 
2393   if (sscanf(buf, "%u", &shares) != 1)
2394     return UV_EINVAL;
2395 
2396   constraint->proportions = (double)shares / 1024.0;
2397 
2398   return 0;
2399 }
2400 
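     /* For illustration (numbers made up), the cgroup v1 equivalents are
      * cpu.cfs_quota_us (e.g. "50000", or "-1" for no quota), cpu.cfs_period_us
      * (typically "100000") and cpu.shares (default "1024", parsed as 1.0).
      */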
2401 int uv__get_constrained_cpu(uv__cpu_constraint* constraint) {
2402   char cgroup[1024];
2403 
2404   /* Read the cgroup from /proc/self/cgroup */
2405   if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
2406     return UV_EIO;
2407 
2408   /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
2409    * The entry for cgroup v2 is always in the format "0::$PATH"
2410    * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
2411   if (strncmp(cgroup, "0::/", 4) == 0)
2412     return uv__get_cgroupv2_constrained_cpu(cgroup, constraint);
2413   else
2414     return uv__get_cgroupv1_constrained_cpu(cgroup, constraint);
2415 }
2416 
2417 
2418 void uv_loadavg(double avg[3]) {
2419   struct sysinfo info;
2420   char buf[128];  /* Large enough to hold all of /proc/loadavg. */
2421 
2422   if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
2423     if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
2424       return;
2425 
2426   if (sysinfo(&info) < 0)
2427     return;
2428 
2429   avg[0] = (double) info.loads[0] / 65536.0;
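       /* sysinfo() reports the load averages as fixed-point values scaled by
        * 2^16 (SI_LOAD_SHIFT), hence the division by 65536.0.
        */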
2430   avg[1] = (double) info.loads[1] / 65536.0;
2431   avg[2] = (double) info.loads[2] / 65536.0;
2432 }
2433 
2434 
2435 static int compare_watchers(const struct watcher_list* a,
2436                             const struct watcher_list* b) {
2437   if (a->wd < b->wd) return -1;
2438   if (a->wd > b->wd) return 1;
2439   return 0;
2440 }
2441 
2442 
2443 static int init_inotify(uv_loop_t* loop) {
2444   int fd;
2445 
2446   if (loop->inotify_fd != -1)
2447     return 0;
2448 
2449   fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
2450   if (fd < 0)
2451     return UV__ERR(errno);
2452 
2453   loop->inotify_fd = fd;
2454   uv__io_init(&loop->inotify_read_watcher, uv__inotify_read, loop->inotify_fd);
2455   uv__io_start(loop, &loop->inotify_read_watcher, POLLIN);
2456 
2457   return 0;
2458 }
2459 
2460 
2461 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
2462   /* Open the inotify_fd, and re-arm all the inotify watchers. */
2463   int err;
2464   struct watcher_list* tmp_watcher_list_iter;
2465   struct watcher_list* watcher_list;
2466   struct watcher_list tmp_watcher_list;
2467   struct uv__queue queue;
2468   struct uv__queue* q;
2469   uv_fs_event_t* handle;
2470   char* tmp_path;
2471 
2472   if (root == NULL)
2473     return 0;
2474 
2475   /* We must restore the old watcher list to be able to close items
2476    * out of it.
2477    */
2478   loop->inotify_watchers = root;
2479 
2480   uv__queue_init(&tmp_watcher_list.watchers);
2481   /* Note that the queue we use is shared with the start and stop()
2482    * functions, making uv__queue_foreach unsafe to use. So we use the
2483    * uv__queue_move trick to safely iterate. Also don't free the watcher
2484    * list until we're done iterating. Cf. uv__inotify_read.
2485    */
2486   RB_FOREACH_SAFE(watcher_list, watcher_root,
2487                   uv__inotify_watchers(loop), tmp_watcher_list_iter) {
2488     watcher_list->iterating = 1;
2489     uv__queue_move(&watcher_list->watchers, &queue);
2490     while (!uv__queue_empty(&queue)) {
2491       q = uv__queue_head(&queue);
2492       handle = uv__queue_data(q, uv_fs_event_t, watchers);
2493       /* It's critical to keep a copy of path here, because it
2494        * will be set to NULL by stop() and then deallocated by
2495        * maybe_free_watcher_list
2496        */
2497       tmp_path = uv__strdup(handle->path);
2498       assert(tmp_path != NULL);
2499       uv__queue_remove(q);
2500       uv__queue_insert_tail(&watcher_list->watchers, q);
2501       uv_fs_event_stop(handle);
2502 
2503       uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
2504       handle->path = tmp_path;
2505     }
2506     watcher_list->iterating = 0;
2507     maybe_free_watcher_list(watcher_list, loop);
2508   }
2509 
2510   uv__queue_move(&tmp_watcher_list.watchers, &queue);
2511   while (!uv__queue_empty(&queue)) {
2512     q = uv__queue_head(&queue);
2513     uv__queue_remove(q);
2514     handle = uv__queue_data(q, uv_fs_event_t, watchers);
2515     tmp_path = handle->path;
2516     handle->path = NULL;
2517     err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
2518     uv__free(tmp_path);
2519     if (err)
2520       return err;
2521   }
2522 
2523   return 0;
2524 }
2525 
2526 
2527 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
2528   struct watcher_list w;
2529   w.wd = wd;
2530   return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
2531 }
2532 
2533 
2534 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
2535   /* if the watcher_list->watchers is being iterated over, we can't free it. */
2536   if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
2537     /* No watchers left for this path. Clean up. */
2538     RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
2539     inotify_rm_watch(loop->inotify_fd, w->wd);
2540     uv__free(w);
2541   }
2542 }
2543 
2544 
2545 static void uv__inotify_read(uv_loop_t* loop,
2546                              uv__io_t* dummy,
2547                              unsigned int events) {
2548   const struct inotify_event* e;
2549   struct watcher_list* w;
2550   uv_fs_event_t* h;
2551   struct uv__queue queue;
2552   struct uv__queue* q;
2553   const char* path;
2554   ssize_t size;
2555   const char *p;
2556   /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
2557   char buf[4096];
2558 
2559   for (;;) {
2560     do
2561       size = read(loop->inotify_fd, buf, sizeof(buf));
2562     while (size == -1 && errno == EINTR);
2563 
2564     if (size == -1) {
2565       assert(errno == EAGAIN || errno == EWOULDBLOCK);
2566       break;
2567     }
2568 
2569     assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */
2570 
2571     /* Now we have one or more inotify_event structs. */
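         /* Records are variable length: each is sizeof(struct inotify_event)
          * plus e->len bytes of NUL-padded name, which is why the loop steps
          * by sizeof(*e) + e->len.
          */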
2572     for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
2573       e = (const struct inotify_event*) p;
2574 
2575       events = 0;
2576       if (e->mask & (IN_ATTRIB|IN_MODIFY))
2577         events |= UV_CHANGE;
2578       if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
2579         events |= UV_RENAME;
2580 
2581       w = find_watcher(loop, e->wd);
2582       if (w == NULL)
2583         continue; /* Stale event, no watchers left. */
2584 
2585       /* inotify does not return the filename when monitoring a single file
2586        * for modifications. Repurpose the filename for API compatibility.
2587        * I'm not convinced this is a good thing, maybe it should go.
2588        */
2589       path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);
2590 
2591       /* We're about to iterate over the queue and call user's callbacks.
2592        * What can go wrong?
2593        * A callback could call uv_fs_event_stop()
2594        * and the queue can change under our feet.
2595        * So, we use the uv__queue_move() trick to iterate safely.
2596        * And we don't free the watcher_list until we're done iterating.
2597        *
2598        * First,
2599        * tell uv_fs_event_stop() (that could be called from a user's callback)
2600        * not to free watcher_list.
2601        */
2602       w->iterating = 1;
2603       uv__queue_move(&w->watchers, &queue);
2604       while (!uv__queue_empty(&queue)) {
2605         q = uv__queue_head(&queue);
2606         h = uv__queue_data(q, uv_fs_event_t, watchers);
2607 
2608         uv__queue_remove(q);
2609         uv__queue_insert_tail(&w->watchers, q);
2610 
2611         h->cb(h, path, events, 0);
2612       }
2613       /* done iterating, time to (maybe) free empty watcher_list */
2614       w->iterating = 0;
2615       maybe_free_watcher_list(w, loop);
2616     }
2617   }
2618 }
2619 
2620 
2621 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
2622   uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
2623   return 0;
2624 }
2625 
2626 
2627 int uv_fs_event_start(uv_fs_event_t* handle,
2628                       uv_fs_event_cb cb,
2629                       const char* path,
2630                       unsigned int flags) {
2631   struct watcher_list* w;
2632   uv_loop_t* loop;
2633   size_t len;
2634   int events;
2635   int err;
2636   int wd;
2637 
2638   if (uv__is_active(handle))
2639     return UV_EINVAL;
2640 
2641   loop = handle->loop;
2642 
2643   err = init_inotify(loop);
2644   if (err)
2645     return err;
2646 
2647   events = IN_ATTRIB
2648          | IN_CREATE
2649          | IN_MODIFY
2650          | IN_DELETE
2651          | IN_DELETE_SELF
2652          | IN_MOVE_SELF
2653          | IN_MOVED_FROM
2654          | IN_MOVED_TO;
2655 
2656   wd = inotify_add_watch(loop->inotify_fd, path, events);
2657   if (wd == -1)
2658     return UV__ERR(errno);
2659 
2660   w = find_watcher(loop, wd);
2661   if (w)
2662     goto no_insert;
2663 
2664   len = strlen(path) + 1;
2665   w = uv__malloc(sizeof(*w) + len);
2666   if (w == NULL)
2667     return UV_ENOMEM;
2668 
2669   w->wd = wd;
2670   w->path = memcpy(w + 1, path, len);
2671   uv__queue_init(&w->watchers);
2672   w->iterating = 0;
2673   RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
2674 
2675 no_insert:
2676   uv__handle_start(handle);
2677   uv__queue_insert_tail(&w->watchers, &handle->watchers);
2678   handle->path = w->path;
2679   handle->cb = cb;
2680   handle->wd = wd;
2681 
2682   return 0;
2683 }
2684 
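     /* Illustrative usage sketch (not part of libuv; on_change and the watched
      * path are hypothetical):
      *
      *   static void on_change(uv_fs_event_t* h, const char* path,
      *                         int events, int status) {
      *     if (events & UV_RENAME) printf("renamed: %s\n", path ? path : "");
      *     if (events & UV_CHANGE) printf("changed: %s\n", path ? path : "");
      *   }
      *
      *   uv_fs_event_t watcher;
      *   uv_fs_event_init(uv_default_loop(), &watcher);
      *   uv_fs_event_start(&watcher, on_change, "/tmp/watched", 0);
      *   uv_run(uv_default_loop(), UV_RUN_DEFAULT);
      */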
2685 
2686 int uv_fs_event_stop(uv_fs_event_t* handle) {
2687   struct watcher_list* w;
2688 
2689   if (!uv__is_active(handle))
2690     return 0;
2691 
2692   w = find_watcher(handle->loop, handle->wd);
2693   assert(w != NULL);
2694 
2695   handle->wd = -1;
2696   handle->path = NULL;
2697   uv__handle_stop(handle);
2698   uv__queue_remove(&handle->watchers);
2699 
2700   maybe_free_watcher_list(w, handle->loop);
2701 
2702   return 0;
2703 }
2704 
2705 
2706 void uv__fs_event_close(uv_fs_event_t* handle) {
2707   uv_fs_event_stop(handle);
2708 }
2709