xref: /libuv/src/unix/linux.c (revision 69bad820)
1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2  * Permission is hereby granted, free of charge, to any person obtaining a copy
3  * of this software and associated documentation files (the "Software"), to
4  * deal in the Software without restriction, including without limitation the
5  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6  * sell copies of the Software, and to permit persons to whom the Software is
7  * furnished to do so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be included in
10  * all copies or substantial portions of the Software.
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18  * IN THE SOFTWARE.
19  */
20 
21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22  * EPOLL* counterparts.  We use the POLL* variants in this file because that
23  * is what libuv uses elsewhere.
24  */
25 
26 #include "uv.h"
27 #include "internal.h"
28 
29 #include <inttypes.h>
30 #include <stdatomic.h>
31 #include <stddef.h>  /* offsetof */
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <errno.h>
38 
39 #include <fcntl.h>
40 #include <ifaddrs.h>
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <netpacket/packet.h>
44 #include <sys/epoll.h>
45 #include <sys/inotify.h>
46 #include <sys/mman.h>
47 #include <sys/param.h>
48 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/stat.h>
51 #include <sys/syscall.h>
52 #include <sys/sysinfo.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/utsname.h>
56 #include <time.h>
57 #include <unistd.h>
58 
59 #ifndef __NR_io_uring_setup
60 # define __NR_io_uring_setup 425
61 #endif
62 
63 #ifndef __NR_io_uring_enter
64 # define __NR_io_uring_enter 426
65 #endif
66 
67 #ifndef __NR_io_uring_register
68 # define __NR_io_uring_register 427
69 #endif
70 
71 #ifndef __NR_copy_file_range
72 # if defined(__x86_64__)
73 #  define __NR_copy_file_range 326
74 # elif defined(__i386__)
75 #  define __NR_copy_file_range 377
76 # elif defined(__s390__)
77 #  define __NR_copy_file_range 375
78 # elif defined(__arm__)
79 #  define __NR_copy_file_range 391
80 # elif defined(__aarch64__)
81 #  define __NR_copy_file_range 285
82 # elif defined(__powerpc__)
83 #  define __NR_copy_file_range 379
84 # elif defined(__arc__)
85 #  define __NR_copy_file_range 285
86 # elif defined(__riscv)
87 #  define __NR_copy_file_range 285
88 # endif
89 #endif /* __NR_copy_file_range */
90 
91 #ifndef __NR_statx
92 # if defined(__x86_64__)
93 #  define __NR_statx 332
94 # elif defined(__i386__)
95 #  define __NR_statx 383
96 # elif defined(__aarch64__)
97 #  define __NR_statx 291
98 # elif defined(__arm__)
99 #  define __NR_statx 397
100 # elif defined(__ppc__)
101 #  define __NR_statx 383
102 # elif defined(__s390__)
103 #  define __NR_statx 379
104 # elif defined(__riscv)
105 #  define __NR_statx 291
106 # endif
107 #endif /* __NR_statx */
108 
109 #ifndef __NR_getrandom
110 # if defined(__x86_64__)
111 #  define __NR_getrandom 318
112 # elif defined(__i386__)
113 #  define __NR_getrandom 355
114 # elif defined(__aarch64__)
115 #  define __NR_getrandom 278
116 # elif defined(__arm__)
117 #  define __NR_getrandom 384
118 # elif defined(__ppc__)
119 #  define __NR_getrandom 359
120 # elif defined(__s390__)
121 #  define __NR_getrandom 349
122 # elif defined(__riscv)
123 #  define __NR_getrandom 278
124 # endif
125 #endif /* __NR_getrandom */
126 
127 enum {
128   UV__IORING_SETUP_SQPOLL = 2u,
129   UV__IORING_SETUP_NO_SQARRAY = 0x10000u,
130 };
131 
132 enum {
133   UV__IORING_FEAT_SINGLE_MMAP = 1u,
134   UV__IORING_FEAT_NODROP = 2u,
135   UV__IORING_FEAT_RSRC_TAGS = 1024u,  /* linux v5.13 */
136 };
137 
138 enum {
139   UV__IORING_OP_READV = 1,
140   UV__IORING_OP_WRITEV = 2,
141   UV__IORING_OP_FSYNC = 3,
142   UV__IORING_OP_OPENAT = 18,
143   UV__IORING_OP_CLOSE = 19,
144   UV__IORING_OP_STATX = 21,
145   UV__IORING_OP_EPOLL_CTL = 29,
146   UV__IORING_OP_RENAMEAT = 35,
147   UV__IORING_OP_UNLINKAT = 36,
148   UV__IORING_OP_MKDIRAT = 37,
149   UV__IORING_OP_SYMLINKAT = 38,
150   UV__IORING_OP_LINKAT = 39,
151   UV__IORING_OP_FTRUNCATE = 55,
152 };
153 
154 enum {
155   UV__IORING_ENTER_GETEVENTS = 1u,
156   UV__IORING_ENTER_SQ_WAKEUP = 2u,
157 };
158 
159 enum {
160   UV__IORING_SQ_NEED_WAKEUP = 1u,
161   UV__IORING_SQ_CQ_OVERFLOW = 2u,
162 };
163 
164 struct uv__io_cqring_offsets {
165   uint32_t head;
166   uint32_t tail;
167   uint32_t ring_mask;
168   uint32_t ring_entries;
169   uint32_t overflow;
170   uint32_t cqes;
171   uint64_t reserved0;
172   uint64_t reserved1;
173 };
174 
175 STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
176 
177 struct uv__io_sqring_offsets {
178   uint32_t head;
179   uint32_t tail;
180   uint32_t ring_mask;
181   uint32_t ring_entries;
182   uint32_t flags;
183   uint32_t dropped;
184   uint32_t array;
185   uint32_t reserved0;
186   uint64_t reserved1;
187 };
188 
189 STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
190 
191 struct uv__io_uring_cqe {
192   uint64_t user_data;
193   int32_t res;
194   uint32_t flags;
195 };
196 
197 STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
198 
199 struct uv__io_uring_sqe {
200   uint8_t opcode;
201   uint8_t flags;
202   uint16_t ioprio;
203   int32_t fd;
204   union {
205     uint64_t off;
206     uint64_t addr2;
207   };
208   union {
209     uint64_t addr;
210   };
211   uint32_t len;
212   union {
213     uint32_t rw_flags;
214     uint32_t fsync_flags;
215     uint32_t open_flags;
216     uint32_t statx_flags;
217   };
218   uint64_t user_data;
219   union {
220     uint16_t buf_index;
221     uint64_t pad[3];
222   };
223 };
224 
225 STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
226 STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
227 STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
228 STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
229 STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
230 STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
231 STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
232 STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
233 STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
234 STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
235 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
236 
237 struct uv__io_uring_params {
238   uint32_t sq_entries;
239   uint32_t cq_entries;
240   uint32_t flags;
241   uint32_t sq_thread_cpu;
242   uint32_t sq_thread_idle;
243   uint32_t features;
244   uint32_t reserved[4];
245   struct uv__io_sqring_offsets sq_off;  /* 40 bytes */
246   struct uv__io_cqring_offsets cq_off;  /* 40 bytes */
247 };
248 
249 STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
250 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
251 STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
252 
253 STATIC_ASSERT(EPOLL_CTL_ADD < 4);
254 STATIC_ASSERT(EPOLL_CTL_DEL < 4);
255 STATIC_ASSERT(EPOLL_CTL_MOD < 4);
256 
257 struct watcher_list {
258   RB_ENTRY(watcher_list) entry;
259   struct uv__queue watchers;
260   int iterating;
261   char* path;
262   int wd;
263 };
264 
265 struct watcher_root {
266   struct watcher_list* rbh_root;
267 };
268 
269 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
270 static void uv__inotify_read(uv_loop_t* loop,
271                              uv__io_t* w,
272                              unsigned int revents);
273 static int compare_watchers(const struct watcher_list* a,
274                             const struct watcher_list* b);
275 static void maybe_free_watcher_list(struct watcher_list* w,
276                                     uv_loop_t* loop);
277 
278 static void uv__epoll_ctl_flush(int epollfd,
279                                 struct uv__iou* ctl,
280                                 struct epoll_event (*events)[256]);
281 
282 static void uv__epoll_ctl_prep(int epollfd,
283                                struct uv__iou* ctl,
284                                struct epoll_event (*events)[256],
285                                int op,
286                                int fd,
287                                struct epoll_event* e);
288 
289 RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
290 
291 
292 static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
293   /* This cast works because watcher_root is a struct with a pointer as its
294    * sole member. Such type punning is unsafe in the presence of strict
295    * pointer aliasing (and is just plain nasty) but that is why libuv
296    * is compiled with -fno-strict-aliasing.
297    */
298   return (struct watcher_root*) &loop->inotify_watchers;
299 }
300 
301 
302 unsigned uv__kernel_version(void) {
303   static _Atomic unsigned cached_version;
304   struct utsname u;
305   unsigned version;
306   unsigned major;
307   unsigned minor;
308   unsigned patch;
309   char v_sig[256];
310   char* needle;
311 
312   version = atomic_load_explicit(&cached_version, memory_order_relaxed);
313   if (version != 0)
314     return version;
315 
316   /* Check /proc/version_signature first as it's the way to get the mainline
317    * kernel version in Ubuntu. The format is:
318    *   Ubuntu ubuntu_kernel_version mainline_kernel_version
319    * For example:
320    *   Ubuntu 5.15.0-79.86-generic 5.15.111
321    */
322   if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
323     if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
324       goto calculate_version;
325 
326   if (-1 == uname(&u))
327     return 0;
328 
329   /* In Debian we need to check `version` instead of `release` to extract the
330    * mainline kernel version. Here is an example of what it looks like:
331    *  #1 SMP Debian 5.10.46-4 (2021-08-03)
332    */
333   needle = strstr(u.version, "Debian ");
334   if (needle != NULL)
335     if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
336       goto calculate_version;
337 
338   if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
339     return 0;
340 
341   /* Handle the case where the process runs under the UNAME26 personality:
342    *
343    * - kernels >= 3.x identify as 2.6.40+x
344    * - kernels >= 4.x identify as 2.6.60+x
345    *
346    * UNAME26 is a poorly conceived hack that doesn't let us distinguish
347    * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
348    * that 2.6.60+x means 4.x.
349    *
350    * Fun fact of the day: it's technically possible to observe the actual
351    * kernel version for a brief moment because uname() first copies out the
352    * real release string before overwriting it with the backcompat string.
353    */
354   if (major == 2 && minor == 6) {
355     if (patch >= 60) {
356       major = 4;
357       minor = patch - 60;
358       patch = 0;
359     } else if (patch >= 40) {
360       major = 3;
361       minor = patch - 40;
362       patch = 0;
363     }
364   }
365 
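  /* Pack major.minor.patch into a single integer so callers can compare
   * against constants like 0x050F5A (5.15.90) with plain integer comparisons.
   * For example, 5.15.111 encodes as 5 * 65536 + 15 * 256 + 111 == 0x050F6F.
   */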
366 calculate_version:
367   version = major * 65536 + minor * 256 + patch;
368   atomic_store_explicit(&cached_version, version, memory_order_relaxed);
369 
370   return version;
371 }
372 
373 
374 ssize_t
375 uv__fs_copy_file_range(int fd_in,
376                        off_t* off_in,
377                        int fd_out,
378                        off_t* off_out,
379                        size_t len,
380                        unsigned int flags)
381 {
382 #ifdef __NR_copy_file_range
383   return syscall(__NR_copy_file_range,
384                  fd_in,
385                  off_in,
386                  fd_out,
387                  off_out,
388                  len,
389                  flags);
390 #else
391   return errno = ENOSYS, -1;
392 #endif
393 }
394 
395 
396 int uv__statx(int dirfd,
397               const char* path,
398               int flags,
399               unsigned int mask,
400               struct uv__statx* statxbuf) {
401 #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
402   return errno = ENOSYS, -1;
403 #else
404   int rc;
405 
406   rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
407   if (rc >= 0)
408     uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
409 
410   return rc;
411 #endif
412 }
413 
414 
415 ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
416 #if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
417   return errno = ENOSYS, -1;
418 #else
419   ssize_t rc;
420 
421   rc = syscall(__NR_getrandom, buf, buflen, flags);
422   if (rc >= 0)
423     uv__msan_unpoison(buf, buflen);
424 
425   return rc;
426 #endif
427 }
428 
429 
430 int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
431   return syscall(__NR_io_uring_setup, entries, params);
432 }
433 
434 
435 int uv__io_uring_enter(int fd,
436                        unsigned to_submit,
437                        unsigned min_complete,
438                        unsigned flags) {
439   /* io_uring_enter used to take a sigset_t but it's unused
440    * in newer kernels unless IORING_ENTER_EXT_ARG is set,
441    * in which case it takes a struct io_uring_getevents_arg.
442    */
443   return syscall(__NR_io_uring_enter,
444                  fd,
445                  to_submit,
446                  min_complete,
447                  flags,
448                  NULL,
449                  0L);
450 }
451 
452 
453 int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
454   return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
455 }
456 
457 
458 static int uv__use_io_uring(uint32_t flags) {
459 #if defined(__ANDROID_API__)
460   return 0;  /* Possibly available but blocked by seccomp. */
461 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
462   /* See https://github.com/libuv/libuv/issues/4158. */
463   return 0;  /* All 32 bits kernels appear buggy. */
464 #elif defined(__powerpc64__) || defined(__ppc64__)
465   /* See https://github.com/libuv/libuv/issues/4283. */
466   return 0; /* Random SIGSEGV in signal handler. */
467 #else
468   /* Ternary: unknown=0, yes=1, no=-1 */
469   static _Atomic int use_io_uring;
470   char* val;
471   int use;
472 
473 #if defined(__hppa__)
474   /* io_uring was first supported on parisc in kernel 6.1 but only became functional in 6.1.51:
475    * https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/
476    */
477   if (uv__kernel_version() < /*6.1.51*/0x060133)
478     return 0;
479 #endif
480 
481   /* SQPOLL is all kinds of buggy but epoll batching should work fine. */
482   if (0 == (flags & UV__IORING_SETUP_SQPOLL))
483     return 1;
484 
485   /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
486   if (uv__kernel_version() < /*5.10.186*/0x050ABA)
487     return 0;
488 
489   use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
490 
491   if (use == 0) {
492     val = getenv("UV_USE_IO_URING");
493     use = val != NULL && atoi(val) > 0 ? 1 : -1;
494     atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
495   }
496 
497   return use > 0;
498 #endif
499 }
500 
501 
502 static void uv__iou_init(int epollfd,
503                          struct uv__iou* iou,
504                          uint32_t entries,
505                          uint32_t flags) {
506   struct uv__io_uring_params params;
507   struct epoll_event e;
508   size_t cqlen;
509   size_t sqlen;
510   size_t maxlen;
511   size_t sqelen;
512   unsigned kernel_version;
513   uint32_t* sqarray;
514   uint32_t i;
515   char* sq;
516   char* sqe;
517   int ringfd;
518   int no_sqarray;
519 
520   sq = MAP_FAILED;
521   sqe = MAP_FAILED;
522 
523   if (!uv__use_io_uring(flags))
524     return;
525 
526   kernel_version = uv__kernel_version();
527   no_sqarray =
528       UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);
529 
530   /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
531    * Mostly academic because we check for a v5.13 kernel afterwards anyway.
532    */
533   memset(&params, 0, sizeof(params));
534   params.flags = flags | no_sqarray;
535 
536   if (flags & UV__IORING_SETUP_SQPOLL)
537     params.sq_thread_idle = 10;  /* milliseconds */
538 
539   /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
540   ringfd = uv__io_uring_setup(entries, &params);
541   if (ringfd == -1)
542     return;
543 
544   /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
545    * actually detecting is whether IORING_OP_STATX works with SQPOLL.
546    */
547   if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
548     goto fail;
549 
550   /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
551   if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
552     goto fail;
553 
554   /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
555   if (!(params.features & UV__IORING_FEAT_NODROP))
556     goto fail;
557 
558   sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
559   cqlen =
560       params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
561   maxlen = sqlen < cqlen ? cqlen : sqlen;
562   sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
563 
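  /* IORING_FEAT_SINGLE_MMAP (checked above) means the submission and
   * completion rings share one mapping at offset 0, so a single mmap of
   * max(sqlen, cqlen) covers both. The SQE array is a separate mapping at
   * IORING_OFF_SQES.
   */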
564   sq = mmap(0,
565             maxlen,
566             PROT_READ | PROT_WRITE,
567             MAP_SHARED | MAP_POPULATE,
568             ringfd,
569             0);  /* IORING_OFF_SQ_RING */
570 
571   sqe = mmap(0,
572              sqelen,
573              PROT_READ | PROT_WRITE,
574              MAP_SHARED | MAP_POPULATE,
575              ringfd,
576              0x10000000ull);  /* IORING_OFF_SQES */
577 
578   if (sq == MAP_FAILED || sqe == MAP_FAILED)
579     goto fail;
580 
581   if (flags & UV__IORING_SETUP_SQPOLL) {
582     /* Only interested in completion events. To get notified when
583      * the kernel pulls items from the submission ring, add POLLOUT.
584      */
585     memset(&e, 0, sizeof(e));
586     e.events = POLLIN;
587     e.data.fd = ringfd;
588 
589     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
590       goto fail;
591   }
592 
593   iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
594   iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
595   iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
596   iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
597   iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
598   iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
599   iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
600   iou->sq = sq;
601   iou->cqe = sq + params.cq_off.cqes;
602   iou->sqe = sqe;
603   iou->sqlen = sqlen;
604   iou->cqlen = cqlen;
605   iou->maxlen = maxlen;
606   iou->sqelen = sqelen;
607   iou->ringfd = ringfd;
608   iou->in_flight = 0;
609 
610   if (no_sqarray)
611     return;
612 
613   sqarray = (uint32_t*) (sq + params.sq_off.array);
614   for (i = 0; i <= iou->sqmask; i++)
615     sqarray[i] = i;  /* Slot -> sqe identity mapping. */
616 
617   return;
618 
619 fail:
620   if (sq != MAP_FAILED)
621     munmap(sq, maxlen);
622 
623   if (sqe != MAP_FAILED)
624     munmap(sqe, sqelen);
625 
626   uv__close(ringfd);
627 }
628 
629 
630 static void uv__iou_delete(struct uv__iou* iou) {
631   if (iou->ringfd > -1) {
632     munmap(iou->sq, iou->maxlen);
633     munmap(iou->sqe, iou->sqelen);
634     uv__close(iou->ringfd);
635     iou->ringfd = -1;
636   }
637 }
638 
639 
640 int uv__platform_loop_init(uv_loop_t* loop) {
641   uv__loop_internal_fields_t* lfields;
642 
643   lfields = uv__get_internal_fields(loop);
644   lfields->ctl.ringfd = -1;
645   lfields->iou.ringfd = -2;  /* "uninitialized" */
646 
647   loop->inotify_watchers = NULL;
648   loop->inotify_fd = -1;
649   loop->backend_fd = epoll_create1(O_CLOEXEC);
650 
651   if (loop->backend_fd == -1)
652     return UV__ERR(errno);
653 
654   uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
655 
656   return 0;
657 }
658 
659 
660 int uv__io_fork(uv_loop_t* loop) {
661   int err;
662   struct watcher_list* root;
663 
664   root = uv__inotify_watchers(loop)->rbh_root;
665 
666   uv__close(loop->backend_fd);
667   loop->backend_fd = -1;
668 
669   /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
670   uv__platform_loop_delete(loop);
671 
672   err = uv__platform_loop_init(loop);
673   if (err)
674     return err;
675 
676   return uv__inotify_fork(loop, root);
677 }
678 
679 
680 void uv__platform_loop_delete(uv_loop_t* loop) {
681   uv__loop_internal_fields_t* lfields;
682 
683   lfields = uv__get_internal_fields(loop);
684   uv__iou_delete(&lfields->ctl);
685   uv__iou_delete(&lfields->iou);
686 
687   if (loop->inotify_fd != -1) {
688     uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
689     uv__close(loop->inotify_fd);
690     loop->inotify_fd = -1;
691   }
692 }
693 
694 
695 struct uv__invalidate {
696   struct epoll_event (*prep)[256];
697   struct epoll_event* events;
698   int nfds;
699 };
700 
701 
702 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
703   uv__loop_internal_fields_t* lfields;
704   struct uv__invalidate* inv;
705   struct epoll_event dummy;
706   int i;
707 
708   lfields = uv__get_internal_fields(loop);
709   inv = lfields->inv;
710 
711   /* Invalidate events with same file descriptor */
712   if (inv != NULL)
713     for (i = 0; i < inv->nfds; i++)
714       if (inv->events[i].data.fd == fd)
715         inv->events[i].data.fd = -1;
716 
717   /* Remove the file descriptor from the epoll.
718    * This avoids a problem where the same file description remains open
719    * in another process, causing repeated junk epoll events.
720    *
721    * Perform EPOLL_CTL_DEL immediately instead of going through
722    * io_uring's submit queue, otherwise the file descriptor may
723    * be closed by the time the kernel starts the operation.
724    *
725    * We pass in a dummy epoll_event, to work around a bug in old kernels.
726    *
727    * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
728    * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
729    */
730   memset(&dummy, 0, sizeof(dummy));
731   epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
732 }
733 
734 
735 int uv__io_check_fd(uv_loop_t* loop, int fd) {
736   struct epoll_event e;
737   int rc;
738 
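  /* Probe whether epoll will accept this fd at all by adding it and then
   * removing it again; EEXIST simply means the fd is already being watched.
   */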
739   memset(&e, 0, sizeof(e));
740   e.events = POLLIN;
741   e.data.fd = -1;
742 
743   rc = 0;
744   if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
745     if (errno != EEXIST)
746       rc = UV__ERR(errno);
747 
748   if (rc == 0)
749     if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
750       abort();
751 
752   return rc;
753 }
754 
755 
756 /* Caller must initialize SQE and call uv__iou_submit(). */
757 static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
758                                                 uv_loop_t* loop,
759                                                 uv_fs_t* req) {
760   struct uv__io_uring_sqe* sqe;
761   uint32_t head;
762   uint32_t tail;
763   uint32_t mask;
764   uint32_t slot;
765 
766   /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
767    * initialization failed. Anything else is a valid ring file descriptor.
768    */
769   if (iou->ringfd == -2) {
770     /* By default, the SQPOLL ring is not created. Enable it only if the loop is
771      * configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING
772      * environment variable is unset or a positive number.
773      */
774     if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
775       if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
776         uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);
777 
778     if (iou->ringfd == -2)
779       iou->ringfd = -1;  /* "failed" */
780   }
781 
782   if (iou->ringfd == -1)
783     return NULL;
784 
785   head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
786                               memory_order_acquire);
787   tail = *iou->sqtail;
788   mask = iou->sqmask;
789 
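  /* Classic one-slot-free ring buffer check: the ring is empty when
   * head == tail and full when advancing tail would land on head.
   */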
790   if ((head & mask) == ((tail + 1) & mask))
791     return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
792 
793   slot = tail & mask;
794   sqe = iou->sqe;
795   sqe = &sqe[slot];
796   memset(sqe, 0, sizeof(*sqe));
797   sqe->user_data = (uintptr_t) req;
798 
799   /* Pacify uv_cancel(). */
800   req->work_req.loop = loop;
801   req->work_req.work = NULL;
802   req->work_req.done = NULL;
803   uv__queue_init(&req->work_req.wq);
804 
805   uv__req_register(loop);
806   iou->in_flight++;
807 
808   return sqe;
809 }
810 
811 
812 static void uv__iou_submit(struct uv__iou* iou) {
813   uint32_t flags;
814 
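  /* The release store publishes the fully written SQE before the kernel
   * (or the SQPOLL thread) can observe the new tail.
   */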
815   atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
816                         *iou->sqtail + 1,
817                         memory_order_release);
818 
819   flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
820                                memory_order_acquire);
821 
822   if (flags & UV__IORING_SQ_NEED_WAKEUP)
823     if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
824       if (errno != EOWNERDEAD)  /* Kernel bug. Harmless, ignore. */
825         perror("libuv: io_uring_enter(wakeup)");  /* Can't happen. */
826 }
827 
828 
829 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
830   struct uv__io_uring_sqe* sqe;
831   struct uv__iou* iou;
832   int kv;
833 
834   kv = uv__kernel_version();
835   /* Work around a poorly understood bug in older kernels where closing a file
836    * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
837    * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
838    * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
839    * but good candidates are the several data race fixes. Interestingly, it
840    * seems to manifest only when running under Docker so the possibility of
841    * a Docker bug can't be completely ruled out either. Yay, computers.
842    * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
843    * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
844    * solved.
845    */
846   if (kv < /* 5.15.90 */ 0x050F5A)
847     return 0;
848 
849   if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
850     return 0;
851 
852 
853   iou = &uv__get_internal_fields(loop)->iou;
854 
855   sqe = uv__iou_get_sqe(iou, loop, req);
856   if (sqe == NULL)
857     return 0;
858 
859   sqe->fd = req->file;
860   sqe->opcode = UV__IORING_OP_CLOSE;
861 
862   uv__iou_submit(iou);
863 
864   return 1;
865 }
866 
867 
868 int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
869   struct uv__io_uring_sqe* sqe;
870   struct uv__iou* iou;
871 
872   if (uv__kernel_version() < /* 6.9 */0x060900)
873     return 0;
874 
875   iou = &uv__get_internal_fields(loop)->iou;
876   sqe = uv__iou_get_sqe(iou, loop, req);
877   if (sqe == NULL)
878     return 0;
879 
880   sqe->fd = req->file;
881   sqe->len = req->off;
882   sqe->opcode = UV__IORING_OP_FTRUNCATE;
883   uv__iou_submit(iou);
884 
885   return 1;
886 }
887 
888 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
889                                   uv_fs_t* req,
890                                   uint32_t fsync_flags) {
891   struct uv__io_uring_sqe* sqe;
892   struct uv__iou* iou;
893 
894   iou = &uv__get_internal_fields(loop)->iou;
895 
896   sqe = uv__iou_get_sqe(iou, loop, req);
897   if (sqe == NULL)
898     return 0;
899 
900   /* Little known fact: setting sqe->off and sqe->len turns
901    * it into an asynchronous sync_file_range() operation.
902    */
903   sqe->fd = req->file;
904   sqe->fsync_flags = fsync_flags;
905   sqe->opcode = UV__IORING_OP_FSYNC;
906 
907   uv__iou_submit(iou);
908 
909   return 1;
910 }
911 
912 
913 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
914   struct uv__io_uring_sqe* sqe;
915   struct uv__iou* iou;
916 
917   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
918     return 0;
919 
920   iou = &uv__get_internal_fields(loop)->iou;
921   sqe = uv__iou_get_sqe(iou, loop, req);
922   if (sqe == NULL)
923     return 0;
924 
925   sqe->addr = (uintptr_t) req->path;
926   sqe->fd = AT_FDCWD;
927   sqe->addr2 = (uintptr_t) req->new_path;
928   sqe->len = AT_FDCWD;
929   sqe->opcode = UV__IORING_OP_LINKAT;
930 
931   uv__iou_submit(iou);
932 
933   return 1;
934 }
935 
936 
937 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
938   struct uv__io_uring_sqe* sqe;
939   struct uv__iou* iou;
940 
941   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
942     return 0;
943 
944   iou = &uv__get_internal_fields(loop)->iou;
945   sqe = uv__iou_get_sqe(iou, loop, req);
946   if (sqe == NULL)
947     return 0;
948 
949   sqe->addr = (uintptr_t) req->path;
950   sqe->fd = AT_FDCWD;
951   sqe->len = req->mode;
952   sqe->opcode = UV__IORING_OP_MKDIRAT;
953 
954   uv__iou_submit(iou);
955 
956   return 1;
957 }
958 
959 
960 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
961   struct uv__io_uring_sqe* sqe;
962   struct uv__iou* iou;
963 
964   iou = &uv__get_internal_fields(loop)->iou;
965 
966   sqe = uv__iou_get_sqe(iou, loop, req);
967   if (sqe == NULL)
968     return 0;
969 
970   sqe->addr = (uintptr_t) req->path;
971   sqe->fd = AT_FDCWD;
972   sqe->len = req->mode;
973   sqe->opcode = UV__IORING_OP_OPENAT;
974   sqe->open_flags = req->flags | O_CLOEXEC;
975 
976   uv__iou_submit(iou);
977 
978   return 1;
979 }
980 
981 
982 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
983   struct uv__io_uring_sqe* sqe;
984   struct uv__iou* iou;
985 
986   iou = &uv__get_internal_fields(loop)->iou;
987 
988   sqe = uv__iou_get_sqe(iou, loop, req);
989   if (sqe == NULL)
990     return 0;
991 
992   sqe->addr = (uintptr_t) req->path;
993   sqe->fd = AT_FDCWD;
994   sqe->addr2 = (uintptr_t) req->new_path;
995   sqe->len = AT_FDCWD;
996   sqe->opcode = UV__IORING_OP_RENAMEAT;
997 
998   uv__iou_submit(iou);
999 
1000   return 1;
1001 }
1002 
1003 
1004 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
1005   struct uv__io_uring_sqe* sqe;
1006   struct uv__iou* iou;
1007 
1008   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
1009     return 0;
1010 
1011   iou = &uv__get_internal_fields(loop)->iou;
1012   sqe = uv__iou_get_sqe(iou, loop, req);
1013   if (sqe == NULL)
1014     return 0;
1015 
1016   sqe->addr = (uintptr_t) req->path;
1017   sqe->fd = AT_FDCWD;
1018   sqe->addr2 = (uintptr_t) req->new_path;
1019   sqe->opcode = UV__IORING_OP_SYMLINKAT;
1020 
1021   uv__iou_submit(iou);
1022 
1023   return 1;
1024 }
1025 
1026 
1027 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
1028   struct uv__io_uring_sqe* sqe;
1029   struct uv__iou* iou;
1030 
1031   iou = &uv__get_internal_fields(loop)->iou;
1032 
1033   sqe = uv__iou_get_sqe(iou, loop, req);
1034   if (sqe == NULL)
1035     return 0;
1036 
1037   sqe->addr = (uintptr_t) req->path;
1038   sqe->fd = AT_FDCWD;
1039   sqe->opcode = UV__IORING_OP_UNLINKAT;
1040 
1041   uv__iou_submit(iou);
1042 
1043   return 1;
1044 }
1045 
1046 
1047 int uv__iou_fs_read_or_write(uv_loop_t* loop,
1048                              uv_fs_t* req,
1049                              int is_read) {
1050   struct uv__io_uring_sqe* sqe;
1051   struct uv__iou* iou;
1052 
1053   /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall back
1054    * to the threadpool on writes. */
1055   if (req->nbufs > IOV_MAX) {
1056     if (is_read)
1057       req->nbufs = IOV_MAX;
1058     else
1059       return 0;
1060   }
1061 
1062   iou = &uv__get_internal_fields(loop)->iou;
1063 
1064   sqe = uv__iou_get_sqe(iou, loop, req);
1065   if (sqe == NULL)
1066     return 0;
1067 
1068   sqe->addr = (uintptr_t) req->bufs;
1069   sqe->fd = req->file;
1070   sqe->len = req->nbufs;
1071   sqe->off = req->off < 0 ? -1 : req->off;
1072   sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1073 
1074   uv__iou_submit(iou);
1075 
1076   return 1;
1077 }
1078 
1079 
1080 int uv__iou_fs_statx(uv_loop_t* loop,
1081                      uv_fs_t* req,
1082                      int is_fstat,
1083                      int is_lstat) {
1084   struct uv__io_uring_sqe* sqe;
1085   struct uv__statx* statxbuf;
1086   struct uv__iou* iou;
1087 
1088   statxbuf = uv__malloc(sizeof(*statxbuf));
1089   if (statxbuf == NULL)
1090     return 0;
1091 
1092   iou = &uv__get_internal_fields(loop)->iou;
1093 
1094   sqe = uv__iou_get_sqe(iou, loop, req);
1095   if (sqe == NULL) {
1096     uv__free(statxbuf);
1097     return 0;
1098   }
1099 
1100   req->ptr = statxbuf;
1101 
1102   sqe->addr = (uintptr_t) req->path;
1103   sqe->addr2 = (uintptr_t) statxbuf;
1104   sqe->fd = AT_FDCWD;
1105   sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1106   sqe->opcode = UV__IORING_OP_STATX;
1107 
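  /* For fstat, pass an empty path plus AT_EMPTY_PATH so statx() operates on
   * req->file itself instead of resolving a path.
   */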
1108   if (is_fstat) {
1109     sqe->addr = (uintptr_t) "";
1110     sqe->fd = req->file;
1111     sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1112   }
1113 
1114   if (is_lstat)
1115     sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1116 
1117   uv__iou_submit(iou);
1118 
1119   return 1;
1120 }
1121 
1122 
1123 void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
1124   buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
1125   buf->st_mode = statxbuf->stx_mode;
1126   buf->st_nlink = statxbuf->stx_nlink;
1127   buf->st_uid = statxbuf->stx_uid;
1128   buf->st_gid = statxbuf->stx_gid;
1129   buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
1130   buf->st_ino = statxbuf->stx_ino;
1131   buf->st_size = statxbuf->stx_size;
1132   buf->st_blksize = statxbuf->stx_blksize;
1133   buf->st_blocks = statxbuf->stx_blocks;
1134   buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
1135   buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
1136   buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
1137   buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
1138   buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
1139   buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
1140   buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
1141   buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
1142   buf->st_flags = 0;
1143   buf->st_gen = 0;
1144 }
1145 
1146 
1147 static void uv__iou_fs_statx_post(uv_fs_t* req) {
1148   struct uv__statx* statxbuf;
1149   uv_stat_t* buf;
1150 
1151   buf = &req->statbuf;
1152   statxbuf = req->ptr;
1153   req->ptr = NULL;
1154 
1155   if (req->result == 0) {
1156     uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
1157     uv__statx_to_stat(statxbuf, buf);
1158     req->ptr = buf;
1159   }
1160 
1161   uv__free(statxbuf);
1162 }
1163 
1164 
1165 static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
1166   struct uv__io_uring_cqe* cqe;
1167   struct uv__io_uring_cqe* e;
1168   uv_fs_t* req;
1169   uint32_t head;
1170   uint32_t tail;
1171   uint32_t mask;
1172   uint32_t i;
1173   uint32_t flags;
1174   int nevents;
1175   int rc;
1176 
1177   head = *iou->cqhead;
1178   tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
1179                               memory_order_acquire);
1180   mask = iou->cqmask;
1181   cqe = iou->cqe;
1182   nevents = 0;
1183 
1184   for (i = head; i != tail; i++) {
1185     e = &cqe[i & mask];
1186 
1187     req = (uv_fs_t*) (uintptr_t) e->user_data;
1188     assert(req->type == UV_FS);
1189 
1190     uv__req_unregister(loop);
1191     iou->in_flight--;
1192 
1193     /* If the op is not supported by the kernel, retry using the thread pool. */
1194     if (e->res == -EOPNOTSUPP) {
1195       uv__fs_post(loop, req);
1196       continue;
1197     }
1198 
1199     /* io_uring stores error codes as negative numbers, same as libuv. */
1200     req->result = e->res;
1201 
1202     switch (req->fs_type) {
1203       case UV_FS_FSTAT:
1204       case UV_FS_LSTAT:
1205       case UV_FS_STAT:
1206         uv__iou_fs_statx_post(req);
1207         break;
1208       default:  /* Squelch -Wswitch warnings. */
1209         break;
1210     }
1211 
1212     uv__metrics_update_idle_time(loop);
1213     req->cb(req);
1214     nevents++;
1215   }
1216 
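  /* Publish tail as the new head; one release store hands every consumed
   * CQE slot back to the kernel.
   */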
1217   atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
1218                         tail,
1219                         memory_order_release);
1220 
1221   /* Check whether CQEs overflowed; if so, enter the kernel to make them
1222    * available. Don't grab them immediately but in the next loop iteration to
1223    * avoid loop starvation. */
1224   flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
1225                                memory_order_acquire);
1226 
1227   if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
1228     do
1229       rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
1230     while (rc == -1 && errno == EINTR);
1231 
1232     if (rc < 0)
1233       perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
1234   }
1235 
1236   uv__metrics_inc_events(loop, nevents);
1237   if (uv__get_internal_fields(loop)->current_timeout == 0)
1238     uv__metrics_inc_events_waiting(loop, nevents);
1239 }
1240 
1241 
1242 /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1243  * executed immediately, otherwise the file descriptor may have been closed
1244  * by the time the kernel starts the operation.
1245  */
1246 static void uv__epoll_ctl_prep(int epollfd,
1247                                struct uv__iou* ctl,
1248                                struct epoll_event (*events)[256],
1249                                int op,
1250                                int fd,
1251                                struct epoll_event* e) {
1252   struct uv__io_uring_sqe* sqe;
1253   struct epoll_event* pe;
1254   uint32_t mask;
1255   uint32_t slot;
1256 
1257   assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
1258   assert(ctl->ringfd != -1);
1259 
1260   mask = ctl->sqmask;
1261   slot = (*ctl->sqtail)++ & mask;
1262 
1263   pe = &(*events)[slot];
1264   *pe = *e;
1265 
1266   sqe = ctl->sqe;
1267   sqe = &sqe[slot];
1268 
1269   memset(sqe, 0, sizeof(*sqe));
1270   sqe->addr = (uintptr_t) pe;
1271   sqe->fd = epollfd;
1272   sqe->len = op;
1273   sqe->off = fd;
1274   sqe->opcode = UV__IORING_OP_EPOLL_CTL;
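  /* Pack everything needed to retry a failed submission into user_data:
   * bits 0-1 hold op (guaranteed to fit by the EPOLL_CTL_* asserts above),
   * bits 2-9 hold the slot index and bits 32 and up hold the fd.
   */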
1275   sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
1276 
1277   if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
1278     uv__epoll_ctl_flush(epollfd, ctl, events);
1279 }
1280 
1281 
1282 static void uv__epoll_ctl_flush(int epollfd,
1283                                 struct uv__iou* ctl,
1284                                 struct epoll_event (*events)[256]) {
1285   struct epoll_event oldevents[256];
1286   struct uv__io_uring_cqe* cqe;
1287   uint32_t oldslot;
1288   uint32_t slot;
1289   uint32_t n;
1290   int fd;
1291   int op;
1292   int rc;
1293 
1294   STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
1295   assert(ctl->ringfd != -1);
1296   assert(*ctl->sqhead != *ctl->sqtail);
1297 
1298   n = *ctl->sqtail - *ctl->sqhead;
1299   do
1300     rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
1301   while (rc == -1 && errno == EINTR);
1302 
1303   if (rc < 0)
1304     perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
1305 
1306   if (rc != (int) n)
1307     abort();
1308 
1309   assert(*ctl->sqhead == *ctl->sqtail);
1310 
1311   memcpy(oldevents, *events, sizeof(*events));
1312 
1313   /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
1314    * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
1315    * that we are already watching. Ignore the former and retry the latter
1316    * with EPOLL_CTL_MOD.
1317    */
1318   while (*ctl->cqhead != *ctl->cqtail) {
1319     slot = (*ctl->cqhead)++ & ctl->cqmask;
1320 
1321     cqe = ctl->cqe;
1322     cqe = &cqe[slot];
1323 
1324     if (cqe->res == 0)
1325       continue;
1326 
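    /* Undo the user_data packing done in uv__epoll_ctl_prep(). */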
1327     fd = cqe->user_data >> 32;
1328     op = 3 & cqe->user_data;
1329     oldslot = 255 & (cqe->user_data >> 2);
1330 
1331     if (op == EPOLL_CTL_DEL)
1332       continue;
1333 
1334     if (op != EPOLL_CTL_ADD)
1335       abort();
1336 
1337     if (cqe->res != -EEXIST)
1338       abort();
1339 
1340     uv__epoll_ctl_prep(epollfd,
1341                        ctl,
1342                        events,
1343                        EPOLL_CTL_MOD,
1344                        fd,
1345                        &oldevents[oldslot]);
1346   }
1347 }
1348 
1349 
1350 void uv__io_poll(uv_loop_t* loop, int timeout) {
1351   uv__loop_internal_fields_t* lfields;
1352   struct epoll_event events[1024];
1353   struct epoll_event prep[256];
1354   struct uv__invalidate inv;
1355   struct epoll_event* pe;
1356   struct epoll_event e;
1357   struct uv__iou* ctl;
1358   struct uv__iou* iou;
1359   int real_timeout;
1360   struct uv__queue* q;
1361   uv__io_t* w;
1362   sigset_t* sigmask;
1363   sigset_t sigset;
1364   uint64_t base;
1365   int have_iou_events;
1366   int have_signals;
1367   int nevents;
1368   int epollfd;
1369   int count;
1370   int nfds;
1371   int fd;
1372   int op;
1373   int i;
1374   int user_timeout;
1375   int reset_timeout;
1376 
1377   lfields = uv__get_internal_fields(loop);
1378   ctl = &lfields->ctl;
1379   iou = &lfields->iou;
1380 
1381   sigmask = NULL;
1382   if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
1383     sigemptyset(&sigset);
1384     sigaddset(&sigset, SIGPROF);
1385     sigmask = &sigset;
1386   }
1387 
1388   assert(timeout >= -1);
1389   base = loop->time;
1390   count = 48; /* Benchmarks suggest this gives the best throughput. */
1391   real_timeout = timeout;
1392 
1393   if (lfields->flags & UV_METRICS_IDLE_TIME) {
1394     reset_timeout = 1;
1395     user_timeout = timeout;
1396     timeout = 0;
1397   } else {
1398     reset_timeout = 0;
1399     user_timeout = 0;
1400   }
1401 
1402   epollfd = loop->backend_fd;
1403 
1404   memset(&e, 0, sizeof(e));
1405 
1406   while (!uv__queue_empty(&loop->watcher_queue)) {
1407     q = uv__queue_head(&loop->watcher_queue);
1408     w = uv__queue_data(q, uv__io_t, watcher_queue);
1409     uv__queue_remove(q);
1410     uv__queue_init(q);
1411 
1412     op = EPOLL_CTL_MOD;
1413     if (w->events == 0)
1414       op = EPOLL_CTL_ADD;
1415 
1416     w->events = w->pevents;
1417     e.events = w->pevents;
1418     e.data.fd = w->fd;
1419     fd = w->fd;
1420 
1421     if (ctl->ringfd != -1) {
1422       uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
1423       continue;
1424     }
1425 
1426     if (!epoll_ctl(epollfd, op, fd, &e))
1427       continue;
1428 
1429     assert(op == EPOLL_CTL_ADD);
1430     assert(errno == EEXIST);
1431 
1432     /* File descriptor that's been watched before, update event mask. */
1433     if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
1434       abort();
1435   }
1436 
1437   inv.events = events;
1438   inv.prep = &prep;
1439   inv.nfds = -1;
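  /* lfields->inv is set to &inv below so that uv__platform_invalidate_fd()
   * can scrub file descriptors from the pending events while callbacks run.
   */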
1440 
1441   for (;;) {
1442     if (loop->nfds == 0)
1443       if (iou->in_flight == 0)
1444         break;
1445 
1446     /* All event mask mutations should be visible to the kernel before
1447      * we enter epoll_pwait().
1448      */
1449     if (ctl->ringfd != -1)
1450       while (*ctl->sqhead != *ctl->sqtail)
1451         uv__epoll_ctl_flush(epollfd, ctl, &prep);
1452 
1453     /* Only need to set the provider_entry_time if timeout != 0. The function
1454      * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
1455      */
1456     if (timeout != 0)
1457       uv__metrics_set_provider_entry_time(loop);
1458 
1459     /* Store the current timeout in a location that's globally accessible so
1460      * other locations like uv__work_done() can determine whether the queue
1461      * of events in the callback were waiting when poll was called.
1462      */
1463     lfields->current_timeout = timeout;
1464 
1465     nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);
1466 
1467     /* Update loop->time unconditionally. It's tempting to skip the update when
1468      * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
1469      * operating system didn't reschedule our process while in the syscall.
1470      */
1471     SAVE_ERRNO(uv__update_time(loop));
1472 
1473     if (nfds == -1)
1474       assert(errno == EINTR);
1475     else if (nfds == 0)
1476       /* An unlimited timeout should only return with events or a signal. */
1477       assert(timeout != -1);
1478 
1479     if (nfds == 0 || nfds == -1) {
1480       if (reset_timeout != 0) {
1481         timeout = user_timeout;
1482         reset_timeout = 0;
1483       } else if (nfds == 0) {
1484         return;
1485       }
1486 
1487       /* Interrupted by a signal. Update timeout and poll again. */
1488       goto update_timeout;
1489     }
1490 
1491     have_iou_events = 0;
1492     have_signals = 0;
1493     nevents = 0;
1494 
1495     inv.nfds = nfds;
1496     lfields->inv = &inv;
1497 
1498     for (i = 0; i < nfds; i++) {
1499       pe = events + i;
1500       fd = pe->data.fd;
1501 
1502       /* Skip invalidated events, see uv__platform_invalidate_fd */
1503       if (fd == -1)
1504         continue;
1505 
1506       if (fd == iou->ringfd) {
1507         uv__poll_io_uring(loop, iou);
1508         have_iou_events = 1;
1509         continue;
1510       }
1511 
1512       assert(fd >= 0);
1513       assert((unsigned) fd < loop->nwatchers);
1514 
1515       w = loop->watchers[fd];
1516 
1517       if (w == NULL) {
1518         /* File descriptor that we've stopped watching, disarm it.
1519          *
1520          * Ignore all errors because we may be racing with another thread
1521          * when the file descriptor is closed.
1522          *
1523          * Perform EPOLL_CTL_DEL immediately instead of going through
1524          * io_uring's submit queue, otherwise the file descriptor may
1525          * be closed by the time the kernel starts the operation.
1526          */
1527         epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
1528         continue;
1529       }
1530 
1531       /* Give users only events they're interested in. Prevents spurious
1532        * callbacks when a previous callback invocation in this loop has stopped
1533        * the current watcher. Also filters out events that the user has not
1534        * requested us to watch.
1535        */
1536       pe->events &= w->pevents | POLLERR | POLLHUP;
1537 
1538       /* Work around an epoll quirk where it sometimes reports just the
1539        * EPOLLERR or EPOLLHUP event.  In order to force the event loop to
1540        * move forward, we merge in the read/write events that the watcher
1541        * is interested in; uv__read() and uv__write() will then deal with
1542        * the error or hangup in the usual fashion.
1543        *
1544        * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
1545        * reads the available data, calls uv_read_stop(), then sometime later
1546        * calls uv_read_start() again.  By then, libuv has forgotten about the
1547        * hangup and the kernel won't report EPOLLIN again because there's
1548        * nothing left to read.  If anything, libuv is to blame here.  The
1549        * current hack is just a quick bandaid; to properly fix it, libuv
1550        * needs to remember the error/hangup event.  We should get that for
1551        * free when we switch over to edge-triggered I/O.
1552        */
1553       if (pe->events == POLLERR || pe->events == POLLHUP)
1554         pe->events |=
1555           w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);
1556 
1557       if (pe->events != 0) {
1558         /* Run signal watchers last.  This also affects child process watchers
1559          * because those are implemented in terms of signal watchers.
1560          */
1561         if (w == &loop->signal_io_watcher) {
1562           have_signals = 1;
1563         } else {
1564           uv__metrics_update_idle_time(loop);
1565           w->cb(loop, w, pe->events);
1566         }
1567 
1568         nevents++;
1569       }
1570     }
1571 
1572     uv__metrics_inc_events(loop, nevents);
1573     if (reset_timeout != 0) {
1574       timeout = user_timeout;
1575       reset_timeout = 0;
1576       uv__metrics_inc_events_waiting(loop, nevents);
1577     }
1578 
1579     if (have_signals != 0) {
1580       uv__metrics_update_idle_time(loop);
1581       loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
1582     }
1583 
1584     lfields->inv = NULL;
1585 
1586     if (have_iou_events != 0)
1587       break;  /* Event loop should cycle now so don't poll again. */
1588 
1589     if (have_signals != 0)
1590       break;  /* Event loop should cycle now so don't poll again. */
1591 
1592     if (nevents != 0) {
1593       if (nfds == ARRAY_SIZE(events) && --count != 0) {
1594         /* Poll for more events but don't block this time. */
1595         timeout = 0;
1596         continue;
1597       }
1598       break;
1599     }
1600 
1601 update_timeout:
1602     if (timeout == 0)
1603       break;
1604 
1605     if (timeout == -1)
1606       continue;
1607 
1608     assert(timeout > 0);
1609 
1610     real_timeout -= (loop->time - base);
1611     if (real_timeout <= 0)
1612       break;
1613 
1614     timeout = real_timeout;
1615   }
1616 
1617   if (ctl->ringfd != -1)
1618     while (*ctl->sqhead != *ctl->sqtail)
1619       uv__epoll_ctl_flush(epollfd, ctl, &prep);
1620 }
1621 
1622 uint64_t uv__hrtime(uv_clocktype_t type) {
1623   static _Atomic clock_t fast_clock_id = -1;
1624   struct timespec t;
1625   clock_t clock_id;
1626 
1627   /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
1628    * millisecond granularity or better.  CLOCK_MONOTONIC_COARSE is
1629    * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
1630    * decide to make a costly system call.
1631    */
1632   /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
1633    * when it has microsecond granularity or better (unlikely).
1634    */
1635   clock_id = CLOCK_MONOTONIC;
1636   if (type != UV_CLOCK_FAST)
1637     goto done;
1638 
1639   clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
1640   if (clock_id != -1)
1641     goto done;
1642 
1643   clock_id = CLOCK_MONOTONIC;
1644   if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
1645     if (t.tv_nsec <= 1 * 1000 * 1000)
1646       clock_id = CLOCK_MONOTONIC_COARSE;
1647 
1648   atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
1649 
1650 done:
1651 
1652   if (clock_gettime(clock_id, &t))
1653     return 0;  /* Not really possible. */
1654 
1655   return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
1656 }
1657 
1658 
1659 int uv_resident_set_memory(size_t* rss) {
1660   char buf[1024];
1661   const char* s;
1662   long val;
1663   int rc;
1664   int i;
1665 
1666   /* rss: 24th element */
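  /* The comm field (field 2) may itself contain spaces, so parsing counts
   * 22 fields forward from the ')' that terminates it to reach rss.
   */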
1667   rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1668   if (rc < 0)
1669     return rc;
1670 
1671   /* find the last ')' */
1672   s = strrchr(buf, ')');
1673   if (s == NULL)
1674     goto err;
1675 
1676   for (i = 1; i <= 22; i++) {
1677     s = strchr(s + 1, ' ');
1678     if (s == NULL)
1679       goto err;
1680   }
1681 
1682   errno = 0;
1683   val = strtol(s, NULL, 10);
1684   if (val < 0 || errno != 0)
1685     goto err;
1686 
1687   *rss = val * getpagesize();
1688   return 0;
1689 
1690 err:
1691   return UV_EINVAL;
1692 }
1693 
1694 int uv_uptime(double* uptime) {
1695   struct timespec now;
1696   char buf[128];
1697 
1698   /* Consult /proc/uptime when present (common case), or fall back to
1699    * clock_gettime. Why not always clock_gettime? It doesn't always return the
1700    * right result under OpenVZ and possibly other containerized environments.
1701    */
1702   if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1703     if (1 == sscanf(buf, "%lf", uptime))
1704       return 0;
1705 
1706   if (clock_gettime(CLOCK_BOOTTIME, &now))
1707     return UV__ERR(errno);
1708 
1709   *uptime = now.tv_sec;
1710   return 0;
1711 }
1712 
1713 
1714 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
1715 #if defined(__PPC__)
1716   static const char model_marker[] = "cpu\t\t: ";
1717   static const char model_marker2[] = "";
1718 #elif defined(__arm__)
1719   static const char model_marker[] = "model name\t: ";
1720   static const char model_marker2[] = "Processor\t: ";
1721 #elif defined(__aarch64__)
1722   static const char model_marker[] = "CPU part\t: ";
1723   static const char model_marker2[] = "";
1724 #elif defined(__mips__)
1725   static const char model_marker[] = "cpu model\t\t: ";
1726   static const char model_marker2[] = "";
1727 #elif defined(__loongarch__)
1728   static const char model_marker[] = "cpu family\t\t: ";
1729   static const char model_marker2[] = "";
1730 #else
1731   static const char model_marker[] = "model name\t: ";
1732   static const char model_marker2[] = "";
1733 #endif
1734   static const char parts[] =
1735 #ifdef __aarch64__
1736     "0x811\nARM810\n"       "0x920\nARM920\n"      "0x922\nARM922\n"
1737     "0x926\nARM926\n"       "0x940\nARM940\n"      "0x946\nARM946\n"
1738     "0x966\nARM966\n"       "0xa20\nARM1020\n"      "0xa22\nARM1022\n"
1739     "0xa26\nARM1026\n"      "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
1740     "0xb56\nARM1156\n"      "0xb76\nARM1176\n"      "0xc05\nCortex-A5\n"
1741     "0xc07\nCortex-A7\n"    "0xc08\nCortex-A8\n"    "0xc09\nCortex-A9\n"
1742     "0xc0d\nCortex-A17\n"   /* Originally A12 */
1743     "0xc0f\nCortex-A15\n"   "0xc0e\nCortex-A17\n"   "0xc14\nCortex-R4\n"
1744     "0xc15\nCortex-R5\n"    "0xc17\nCortex-R7\n"    "0xc18\nCortex-R8\n"
1745     "0xc20\nCortex-M0\n"    "0xc21\nCortex-M1\n"    "0xc23\nCortex-M3\n"
1746     "0xc24\nCortex-M4\n"    "0xc27\nCortex-M7\n"    "0xc60\nCortex-M0+\n"
1747     "0xd01\nCortex-A32\n"   "0xd03\nCortex-A53\n"   "0xd04\nCortex-A35\n"
1748     "0xd05\nCortex-A55\n"   "0xd06\nCortex-A65\n"   "0xd07\nCortex-A57\n"
1749     "0xd08\nCortex-A72\n"   "0xd09\nCortex-A73\n"   "0xd0a\nCortex-A75\n"
1750     "0xd0b\nCortex-A76\n"   "0xd0c\nNeoverse-N1\n"  "0xd0d\nCortex-A77\n"
1751     "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n"   "0xd20\nCortex-M23\n"
1752     "0xd21\nCortex-M33\n"   "0xd41\nCortex-A78\n"   "0xd42\nCortex-A78AE\n"
1753     "0xd4a\nNeoverse-E1\n"  "0xd4b\nCortex-A78C\n"
1754 #endif
1755     "";
1756   struct cpu {
1757     unsigned long long freq, user, nice, sys, idle, irq;
1758     unsigned model;
1759   };
1760   FILE* fp;
1761   char* p;
1762   int found;
1763   int n;
1764   unsigned i;
1765   unsigned cpu;
1766   unsigned maxcpu;
1767   unsigned size;
1768   unsigned long long skip;
1769   struct cpu (*cpus)[8192];  /* Kernel maximum. */
1770   struct cpu* c;
1771   struct cpu t;
1772   char (*model)[64];
1773   unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
1774   /* Assumption: even big.LITTLE systems will have only a handful
1775    * of different CPU models. Most systems will just have one.
1776    */
1777   char models[8][64];
1778   char buf[1024];
1779 
1780   memset(bitmap, 0, sizeof(bitmap));
1781   memset(models, 0, sizeof(models));
1782   snprintf(*models, sizeof(*models), "unknown");
1783   maxcpu = 0;
1784 
1785   cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
1786   if (cpus == NULL)
1787     return UV_ENOMEM;
1788 
1789   fp = uv__open_file("/proc/stat");
1790   if (fp == NULL) {
1791     uv__free(cpus);
1792     return UV__ERR(errno);
1793   }
1794 
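       /* Discard the first line, the aggregate "cpu ..." totals; the per-CPU
        * "cpuN ..." lines that follow are ordered user, nice, system, idle,
        * iowait (read into `skip` and ignored), irq.
        */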
1795   if (NULL == fgets(buf, sizeof(buf), fp))
1796     abort();
1797 
1798   for (;;) {
1799     memset(&t, 0, sizeof(t));
1800 
1801     n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
1802                &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
1803 
1804     if (n != 7)
1805       break;
1806 
1807     if (NULL == fgets(buf, sizeof(buf), fp))
1808       abort();
1809 
1810     if (cpu >= ARRAY_SIZE(*cpus))
1811       continue;
1812 
1813     (*cpus)[cpu] = t;
1814 
1815     bitmap[cpu >> 3] |= 1 << (cpu & 7);
1816 
1817     if (cpu >= maxcpu)
1818       maxcpu = cpu + 1;
1819   }
1820 
1821   fclose(fp);
1822 
1823   fp = uv__open_file("/proc/cpuinfo");
1824   if (fp == NULL)
1825     goto nocpuinfo;
1826 
1827   for (;;) {
1828     if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
1829       break;  /* Parse error. */
1830 
1831     while (fgets(buf, sizeof(buf), fp)) {
1832       if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) {
1833         p = buf + sizeof(model_marker) - 1;
1834         goto parts;
1835       }
1836       if (!*model_marker2)
1837         continue;
1838       if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) {
1839         p = buf + sizeof(model_marker2) - 1;
1840         goto parts;
1841       }
1842     }
1843 
1844     goto next;  /* Not found. */
1845 
1846 parts:
1847     n = (int) strcspn(p, "\n");
1848 
1849     /* arm64: translate CPU part code to model name. */
1850     if (*parts) {
1851       p = memmem(parts, sizeof(parts) - 1, p, n + 1);
1852       if (p == NULL)
1853         p = "unknown";
1854       else
1855         p += n + 1;
1856       n = (int) strcspn(p, "\n");
1857     }
1858 
1859     found = 0;
1860     for (model = models; !found && model < ARRAY_END(models); model++)
1861       found = !strncmp(p, *model, strlen(*model));
1862 
1863     if (!found)
1864       goto next;
1865 
1866     if (**model == '\0')
1867       snprintf(*model, sizeof(*model), "%.*s", n, p);
1868 
1869     if (cpu < maxcpu)
1870       (*cpus)[cpu].model = model - models;
1871 
1872 next:
1873     while (fgets(buf, sizeof(buf), fp))
1874       if (*buf == '\n')
1875         break;
1876   }
1877 
1878   fclose(fp);
1879   fp = NULL;
1880 
1881 nocpuinfo:
1882 
1883   n = 0;
1884   for (cpu = 0; cpu < maxcpu; cpu++) {
1885     if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1886       continue;
1887 
1888     n++;
1889     snprintf(buf, sizeof(buf),
1890              "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
1891 
1892     fp = uv__open_file(buf);
1893     if (fp == NULL)
1894       continue;
1895 
1896     if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
1897       abort();
1898     fclose(fp);
1899     fp = NULL;
1900   }
1901 
1902   size = n * sizeof(**ci) + sizeof(models);
1903   *ci = uv__malloc(size);
1904   *count = 0;
1905 
1906   if (*ci == NULL) {
1907     uv__free(cpus);
1908     return UV_ENOMEM;
1909   }
1910 
1911   *count = n;
1912   p = memcpy(*ci + n, models, sizeof(models));
1913 
1914   i = 0;
1915   for (cpu = 0; cpu < maxcpu; cpu++) {
1916     if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
1917       continue;
1918 
1919     c = *cpus + cpu;
1920 
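         /* c->freq comes from scaling_cur_freq, which is in kHz, so dividing
          * by 1000 yields the MHz value reported in uv_cpu_info_t.speed.
          */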
1921     (*ci)[i++] = (uv_cpu_info_t) {
1922       .model     = p + c->model * sizeof(*model),
1923       .speed     = c->freq / 1000,
1924       /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
1925        * therefore the multiplier is always 1000/100 = 10.
1926        */
1927       .cpu_times = (struct uv_cpu_times_s) {
1928         .user = 10 * c->user,
1929         .nice = 10 * c->nice,
1930         .sys  = 10 * c->sys,
1931         .idle = 10 * c->idle,
1932         .irq  = 10 * c->irq,
1933       },
1934     };
1935   }
1936 
1937   uv__free(cpus);
1938 
1939   return 0;
1940 }
1941 
1942 
1943 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
1944   if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
1945     return 1;
1946   if (ent->ifa_addr == NULL)
1947     return 1;
1948   /*
1949    * On Linux getifaddrs returns information related to the raw underlying
1950    * devices. We're not interested in this information yet.
1951    */
1952   if (ent->ifa_addr->sa_family == PF_PACKET)
1953     return exclude_type;
1954   return !exclude_type;
1955 }
1956 
1957 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
1958   struct ifaddrs *addrs, *ent;
1959   uv_interface_address_t* address;
1960   int i;
1961   struct sockaddr_ll *sll;
1962 
1963   *count = 0;
1964   *addresses = NULL;
1965 
1966   if (getifaddrs(&addrs))
1967     return UV__ERR(errno);
1968 
1969   /* Count the number of interfaces */
1970   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1971     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1972       continue;
1973 
1974     (*count)++;
1975   }
1976 
1977   if (*count == 0) {
1978     freeifaddrs(addrs);
1979     return 0;
1980   }
1981 
1982   /* Make sure the memory is initialized to zero using calloc() */
1983   *addresses = uv__calloc(*count, sizeof(**addresses));
1984   if (!(*addresses)) {
1985     freeifaddrs(addrs);
1986     return UV_ENOMEM;
1987   }
1988 
1989   address = *addresses;
1990 
1991   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1992     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1993       continue;
1994 
1995     address->name = uv__strdup(ent->ifa_name);
1996 
1997     if (ent->ifa_addr->sa_family == AF_INET6) {
1998       address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
1999     } else {
2000       address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
2001     }
2002 
2003     if (ent->ifa_netmask->sa_family == AF_INET6) {
2004       address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
2005     } else {
2006       address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
2007     }
2008 
2009     address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
2010 
2011     address++;
2012   }
2013 
2014   /* Fill in physical addresses for each interface */
2015   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
2016     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
2017       continue;
2018 
2019     address = *addresses;
2020 
2021     for (i = 0; i < (*count); i++) {
2022       size_t namelen = strlen(ent->ifa_name);
2023       /* Alias interfaces share the same physical address */
2024       if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
2025           (address->name[namelen] == 0 || address->name[namelen] == ':')) {
2026         sll = (struct sockaddr_ll*)ent->ifa_addr;
2027         memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
2028       }
2029       address++;
2030     }
2031   }
2032 
2033   freeifaddrs(addrs);
2034 
2035   return 0;
2036 }
2037 
2038 
2039 void uv_free_interface_addresses(uv_interface_address_t* addresses,
2040   int count) {
2041   int i;
2042 
2043   for (i = 0; i < count; i++) {
2044     uv__free(addresses[i].name);
2045   }
2046 
2047   uv__free(addresses);
2048 }
2049 
2050 
2051 void uv__set_process_title(const char* title) {
2052 #if defined(PR_SET_NAME)
2053   prctl(PR_SET_NAME, title);  /* Only copies first 16 characters. */
2054 #endif
2055 }
2056 
2057 
2058 static uint64_t uv__read_proc_meminfo(const char* what) {
2059   uint64_t rc;
2060   char* p;
2061   char buf[4096];  /* Large enough to hold all of /proc/meminfo. */
2062 
2063   if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
2064     return 0;
2065 
2066   p = strstr(buf, what);
2067 
2068   if (p == NULL)
2069     return 0;
2070 
2071   p += strlen(what);
2072 
2073   rc = 0;
2074   sscanf(p, "%" PRIu64 " kB", &rc);
2075 
2076   return rc * 1024;
2077 }
2078 
2079 
2080 uint64_t uv_get_free_memory(void) {
2081   struct sysinfo info;
2082   uint64_t rc;
2083 
2084   rc = uv__read_proc_meminfo("MemAvailable:");
2085 
2086   if (rc != 0)
2087     return rc;
2088 
2089   if (0 == sysinfo(&info))
2090     return (uint64_t) info.freeram * info.mem_unit;
2091 
2092   return 0;
2093 }
2094 
2095 
2096 uint64_t uv_get_total_memory(void) {
2097   struct sysinfo info;
2098   uint64_t rc;
2099 
2100   rc = uv__read_proc_meminfo("MemTotal:");
2101 
2102   if (rc != 0)
2103     return rc;
2104 
2105   if (0 == sysinfo(&info))
2106     return (uint64_t) info.totalram * info.mem_unit;
2107 
2108   return 0;
2109 }
2110 
2111 
2112 static uint64_t uv__read_uint64(const char* filename) {
2113   char buf[32];  /* Large enough to hold an encoded uint64_t. */
2114   uint64_t rc;
2115 
2116   rc = 0;
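       /* cgroup v2 writes the literal string "max" when there is no limit. */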
2117   if (0 == uv__slurp(filename, buf, sizeof(buf)))
2118     if (1 != sscanf(buf, "%" PRIu64, &rc))
2119       if (0 == strcmp(buf, "max\n"))
2120         rc = UINT64_MAX;
2121 
2122   return rc;
2123 }
2124 
2125 
2126 /* Given a buffer with the contents of a cgroup1 /proc/self/cgroups,
2127  * finds the location and length of the memory controller mount path.
2128  * This disregards the leading / for easy concatenation of paths.
2129  * Returns NULL if the memory controller wasn't found. */
2130 static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
2131                                                 int* n) {
2132   char* p;
2133 
2134   /* Seek to the memory controller line. */
2135   p = strchr(buf, ':');
2136   while (p != NULL && strncmp(p, ":memory:", 8)) {
2137     p = strchr(p, '\n');
2138     if (p != NULL)
2139       p = strchr(p, ':');
2140   }
2141 
2142   if (p != NULL) {
2143     /* Determine the length of the mount path. */
2144     p = p + strlen(":memory:/");
2145     *n = (int) strcspn(p, "\n");
2146   }
2147 
2148   return p;
2149 }
2150 
2151 static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
2152                                           uint64_t* max) {
2153   char filename[4097];
2154   char* p;
2155   int n;
2156   uint64_t cgroup1_max;
2157 
2158   /* Find out where the controller is mounted. */
2159   p = uv__cgroup1_find_memory_controller(buf, &n);
2160   if (p != NULL) {
2161     snprintf(filename, sizeof(filename),
2162              "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
2163     *high = uv__read_uint64(filename);
2164 
2165     snprintf(filename, sizeof(filename),
2166              "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
2167     *max = uv__read_uint64(filename);
2168 
2169     /* If the controller wasn't mounted, the reads above will have failed,
2170      * as indicated by uv__read_uint64 returning 0.
2171      */
2172      if (*high != 0 && *max != 0)
2173        goto update_limits;
2174   }
2175 
2176   /* Fall back to the limits of the global memory controller. */
2177   *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
2178   *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
2179 
2180   /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
2181    * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
2182    */
2183 update_limits:
2184   cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
2185   if (*high == cgroup1_max)
2186     *high = UINT64_MAX;
2187   if (*max == cgroup1_max)
2188     *max = UINT64_MAX;
2189 }
2190 
2191 static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
2192                                           uint64_t* max) {
2193   char filename[4097];
2194   char* p;
2195   int n;
2196 
2197   /* Find out where the controller is mounted. */
2198   p = buf + strlen("0::/");
2199   n = (int) strcspn(p, "\n");
2200 
2201   /* Read the memory limits of the controller. */
2202   snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
2203   *max = uv__read_uint64(filename);
2204   snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
2205   *high = uv__read_uint64(filename);
2206 }
2207 
2208 static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
2209   uint64_t high;
2210   uint64_t max;
2211 
2212   /* In the case of cgroupv2, we'll only have a single entry. */
2213   if (strncmp(buf, "0::/", 4))
2214     uv__get_cgroup1_memory_limits(buf, &high, &max);
2215   else
2216     uv__get_cgroup2_memory_limits(buf, &high, &max);
2217 
2218   if (high == 0 || max == 0)
2219     return 0;
2220 
2221   return high < max ? high : max;
2222 }
2223 
2224 uint64_t uv_get_constrained_memory(void) {
2225   char buf[1024];
2226 
2227   if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2228     return 0;
2229 
2230   return uv__get_cgroup_constrained_memory(buf);
2231 }
2232 
2233 
2234 static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
2235   char filename[4097];
2236   uint64_t current;
2237   char* p;
2238   int n;
2239 
2240   /* Find out where the controller is mounted. */
2241   p = uv__cgroup1_find_memory_controller(buf, &n);
2242   if (p != NULL) {
2243     snprintf(filename, sizeof(filename),
2244             "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
2245     current = uv__read_uint64(filename);
2246 
2247     /* If the controller wasn't mounted, the reads above will have failed,
2248      * as indicated by uv__read_uint64 returning 0.
2249      */
2250     if (current != 0)
2251       return current;
2252   }
2253 
2254   /* Fall back to the usage of the global memory controller. */
2255   return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
2256 }
2257 
2258 static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
2259   char filename[4097];
2260   char* p;
2261   int n;
2262 
2263   /* Find out where the controller is mounted. */
2264   p = buf + strlen("0::/");
2265   n = (int) strcspn(p, "\n");
2266 
2267   snprintf(filename, sizeof(filename),
2268            "/sys/fs/cgroup/%.*s/memory.current", n, p);
2269   return uv__read_uint64(filename);
2270 }
2271 
2272 uint64_t uv_get_available_memory(void) {
2273   char buf[1024];
2274   uint64_t constrained;
2275   uint64_t current;
2276   uint64_t total;
2277 
2278   if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
2279     return 0;
2280 
2281   constrained = uv__get_cgroup_constrained_memory(buf);
2282   if (constrained == 0)
2283     return uv_get_free_memory();
2284 
2285   total = uv_get_total_memory();
2286   if (constrained > total)
2287     return uv_get_free_memory();
2288 
2289   /* In the case of cgroupv2, we'll only have a single entry. */
2290   if (strncmp(buf, "0::/", 4))
2291     current = uv__get_cgroup1_current_memory(buf);
2292   else
2293     current = uv__get_cgroup2_current_memory(buf);
2294 
2295   /* memory usage can be higher than the limit (for short bursts of time) */
2296   if (constrained < current)
2297     return 0;
2298 
2299   return constrained - current;
2300 }
2301 
2302 
2303 static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
2304                                             uv__cpu_constraint* constraint) {
2305   char path[256];
2306   char buf[1024];
2307   unsigned int weight;
2308   int cgroup_size;
2309   const char* cgroup_trimmed;
2310   char quota_buf[16];
2311 
2312   if (strncmp(cgroup, "0::/", 4) != 0)
2313     return UV_EINVAL;
2314 
2315   /* Skip the "0::/" prefix and exclude the trailing newline by length. */
2316   cgroup_trimmed = cgroup + sizeof("0::/") - 1;      /* Start of the cgroup path */
2317   cgroup_size = (int)strcspn(cgroup_trimmed, "\n");  /* Length up to the newline */
2318 
2319   /* Construct the path to the cpu.max file */
2320   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.max", cgroup_size,
2321            cgroup_trimmed);
2322 
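       /* cpu.max holds "<quota> <period>" in microseconds; quota is the
        * literal "max" when the cgroup is unrestricted.
        */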
2323   /* Read cpu.max */
2324   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2325     return UV_EIO;
2326 
2327   if (sscanf(buf, "%15s %llu", quota_buf, &constraint->period_length) != 2)
2328     return UV_EINVAL;
2329 
2330   if (strncmp(quota_buf, "max", 3) == 0)
2331     constraint->quota_per_period = LLONG_MAX;
2332   else if (sscanf(quota_buf, "%lld", &constraint->quota_per_period) != 1)
2333     return UV_EINVAL;  /* Conversion failed. */
2334 
2335   /* Construct the path to the cpu.weight file */
2336   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.weight", cgroup_size,
2337            cgroup_trimmed);
2338 
2339   /* Read cpu.weight */
2340   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2341     return UV_EIO;
2342 
2343   if (sscanf(buf, "%u", &weight) != 1)
2344     return UV_EINVAL;
2345 
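       /* cpu.weight defaults to 100, so 1.0 here means the default weight. */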
2346   constraint->proportions = (double)weight / 100.0;
2347 
2348   return 0;
2349 }
2350 
2351 static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
2352                                              int* cgroup_size) {
2353   /* Seek to the cpu controller line. */
2354   char* cgroup_cpu = strstr(cgroup, ":cpu,");
2355 
2356   if (cgroup_cpu != NULL) {
2357     /* Skip the controller prefix to the start of the cgroup path. */
2358     cgroup_cpu += sizeof(":cpu,") - 1;
2359     /* Determine the length of the cgroup path, excluding the newline. */
2360     *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
2361   }
2362 
2363   return cgroup_cpu;
2364 }
2365 
2366 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
2367                                             uv__cpu_constraint* constraint) {
2368   char path[256];
2369   char buf[1024];
2370   unsigned int shares;
2371   int cgroup_size;
2372   char* cgroup_cpu;
2373 
2374   cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
2375 
2376   if (cgroup_cpu == NULL)
2377     return UV_EIO;
2378 
2379   /* Construct the path to the cpu.cfs_quota_us file */
2380   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
2381            cgroup_size, cgroup_cpu);
2382 
2383   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2384     return UV_EIO;
2385 
2386   if (sscanf(buf, "%lld", &constraint->quota_per_period) != 1)
2387     return UV_EINVAL;
2388 
2389   /* Construct the path to the cpu.cfs_period_us file */
2390   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
2391            cgroup_size, cgroup_cpu);
2392 
2393   /* Read cpu.cfs_period_us */
2394   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2395     return UV_EIO;
2396 
2397   if (sscanf(buf, "%lld", &constraint->period_length) != 1)
2398     return UV_EINVAL;
2399 
2400   /* Construct the path to the cpu.shares file */
2401   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.shares", cgroup_size,
2402            cgroup_cpu);
2403 
2404   /* Read cpu.shares */
2405   if (uv__slurp(path, buf, sizeof(buf)) < 0)
2406     return UV_EIO;
2407 
2408   if (sscanf(buf, "%u", &shares) != 1)
2409     return UV_EINVAL;
2410 
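       /* cpu.shares defaults to 1024, so 1.0 here means the default share. */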
2411   constraint->proportions = (double)shares / 1024.0;
2412 
2413   return 0;
2414 }
2415 
2416 int uv__get_constrained_cpu(uv__cpu_constraint* constraint) {
2417   char cgroup[1024];
2418 
2419   /* Read the cgroup from /proc/self/cgroup */
2420   if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
2421     return UV_EIO;
2422 
2423   /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
2424    * The entry for cgroup v2 is always in the format "0::$PATH"
2425    * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
2426   if (strncmp(cgroup, "0::/", 4) == 0)
2427     return uv__get_cgroupv2_constrained_cpu(cgroup, constraint);
2428   else
2429     return uv__get_cgroupv1_constrained_cpu(cgroup, constraint);
2430 }
2431 
2432 
2433 void uv_loadavg(double avg[3]) {
2434   struct sysinfo info;
2435   char buf[128];  /* Large enough to hold all of /proc/loadavg. */
2436 
2437   if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
2438     if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
2439       return;
2440 
2441   if (sysinfo(&info) < 0)
2442     return;
2443 
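       /* sysinfo() reports load averages as fixed point numbers scaled by
        * 1 << SI_LOAD_SHIFT, i.e. 65536.
        */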
2444   avg[0] = (double) info.loads[0] / 65536.0;
2445   avg[1] = (double) info.loads[1] / 65536.0;
2446   avg[2] = (double) info.loads[2] / 65536.0;
2447 }
2448 
2449 
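     /* Watcher lists live in a red-black tree keyed by inotify watch
      * descriptor; see find_watcher() and uv__inotify_watchers().
      */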
2450 static int compare_watchers(const struct watcher_list* a,
2451                             const struct watcher_list* b) {
2452   if (a->wd < b->wd) return -1;
2453   if (a->wd > b->wd) return 1;
2454   return 0;
2455 }
2456 
2457 
2458 static int init_inotify(uv_loop_t* loop) {
2459   int fd;
2460 
2461   if (loop->inotify_fd != -1)
2462     return 0;
2463 
2464   fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
2465   if (fd < 0)
2466     return UV__ERR(errno);
2467 
2468   loop->inotify_fd = fd;
2469   uv__io_init(&loop->inotify_read_watcher, uv__inotify_read, loop->inotify_fd);
2470   uv__io_start(loop, &loop->inotify_read_watcher, POLLIN);
2471 
2472   return 0;
2473 }
2474 
2475 
2476 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
2477   /* Open the inotify_fd, and re-arm all the inotify watchers. */
2478   int err;
2479   struct watcher_list* tmp_watcher_list_iter;
2480   struct watcher_list* watcher_list;
2481   struct watcher_list tmp_watcher_list;
2482   struct uv__queue queue;
2483   struct uv__queue* q;
2484   uv_fs_event_t* handle;
2485   char* tmp_path;
2486 
2487   if (root == NULL)
2488     return 0;
2489 
2490   /* We must restore the old watcher list to be able to close items
2491    * out of it.
2492    */
2493   loop->inotify_watchers = root;
2494 
2495   uv__queue_init(&tmp_watcher_list.watchers);
2496   /* Note that the queue we use is shared with the start and stop()
2497    * functions, making uv__queue_foreach unsafe to use. So we use the
2498    * uv__queue_move trick to safely iterate. Also don't free the watcher
2499    * list until we're done iterating. c.f. uv__inotify_read.
2500    */
2501   RB_FOREACH_SAFE(watcher_list, watcher_root,
2502                   uv__inotify_watchers(loop), tmp_watcher_list_iter) {
2503     watcher_list->iterating = 1;
2504     uv__queue_move(&watcher_list->watchers, &queue);
2505     while (!uv__queue_empty(&queue)) {
2506       q = uv__queue_head(&queue);
2507       handle = uv__queue_data(q, uv_fs_event_t, watchers);
2508       /* It's critical to keep a copy of path here, because it
2509        * will be set to NULL by stop() and then deallocated by
2510        * maybe_free_watcher_list
2511        */
2512       tmp_path = uv__strdup(handle->path);
2513       assert(tmp_path != NULL);
2514       uv__queue_remove(q);
2515       uv__queue_insert_tail(&watcher_list->watchers, q);
2516       uv_fs_event_stop(handle);
2517 
2518       uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
2519       handle->path = tmp_path;
2520     }
2521     watcher_list->iterating = 0;
2522     maybe_free_watcher_list(watcher_list, loop);
2523   }
2524 
2525   uv__queue_move(&tmp_watcher_list.watchers, &queue);
2526   while (!uv__queue_empty(&queue)) {
2527       q = uv__queue_head(&queue);
2528       uv__queue_remove(q);
2529       handle = uv__queue_data(q, uv_fs_event_t, watchers);
2530       tmp_path = handle->path;
2531       handle->path = NULL;
2532       err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
2533       uv__free(tmp_path);
2534       if (err)
2535         return err;
2536   }
2537 
2538   return 0;
2539 }
2540 
2541 
2542 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
2543   struct watcher_list w;
2544   w.wd = wd;
2545   return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
2546 }
2547 
2548 
2549 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
2550   /* if the watcher_list->watchers is being iterated over, we can't free it. */
2551   if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
2552     /* No watchers left for this path. Clean up. */
2553     RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
2554     inotify_rm_watch(loop->inotify_fd, w->wd);
2555     uv__free(w);
2556   }
2557 }
2558 
2559 
2560 static void uv__inotify_read(uv_loop_t* loop,
2561                              uv__io_t* dummy,
2562                              unsigned int events) {
2563   const struct inotify_event* e;
2564   struct watcher_list* w;
2565   uv_fs_event_t* h;
2566   struct uv__queue queue;
2567   struct uv__queue* q;
2568   const char* path;
2569   ssize_t size;
2570   const char *p;
2571   /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
2572   char buf[4096];
2573 
2574   for (;;) {
2575     do
2576       size = read(loop->inotify_fd, buf, sizeof(buf));
2577     while (size == -1 && errno == EINTR);
2578 
2579     if (size == -1) {
2580       assert(errno == EAGAIN || errno == EWOULDBLOCK);
2581       break;
2582     }
2583 
2584     assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */
2585 
2586     /* Now we have one or more inotify_event structs. */
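         /* Each record is a fixed-size inotify_event header followed by
          * e->len bytes of NUL-padded (possibly empty) name.
          */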
2587     for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
2588       e = (const struct inotify_event*) p;
2589 
2590       events = 0;
2591       if (e->mask & (IN_ATTRIB|IN_MODIFY))
2592         events |= UV_CHANGE;
2593       if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
2594         events |= UV_RENAME;
2595 
2596       w = find_watcher(loop, e->wd);
2597       if (w == NULL)
2598         continue; /* Stale event, no watchers left. */
2599 
2600       /* inotify does not return the filename when monitoring a single file
2601        * for modifications. Repurpose the filename for API compatibility.
2602        * I'm not convinced this is a good thing, maybe it should go.
2603        */
2604       path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);
2605 
2606       /* We're about to iterate over the queue and call user's callbacks.
2607        * What can go wrong?
2608        * A callback could call uv_fs_event_stop()
2609        * and the queue can change under our feet.
2610        * So, we use uv__queue_move() trick to safely iterate over the queue.
2611        * And we don't free the watcher_list until we're done iterating.
2612        *
2613        * First,
2614        * tell uv_fs_event_stop() (that could be called from a user's callback)
2615        * not to free watcher_list.
2616        */
2617       w->iterating = 1;
2618       uv__queue_move(&w->watchers, &queue);
2619       while (!uv__queue_empty(&queue)) {
2620         q = uv__queue_head(&queue);
2621         h = uv__queue_data(q, uv_fs_event_t, watchers);
2622 
2623         uv__queue_remove(q);
2624         uv__queue_insert_tail(&w->watchers, q);
2625 
2626         h->cb(h, path, events, 0);
2627       }
2628       /* done iterating, time to (maybe) free empty watcher_list */
2629       w->iterating = 0;
2630       maybe_free_watcher_list(w, loop);
2631     }
2632   }
2633 }
2634 
2635 
2636 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
2637   uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
2638   return 0;
2639 }
2640 
2641 
2642 int uv_fs_event_start(uv_fs_event_t* handle,
2643                       uv_fs_event_cb cb,
2644                       const char* path,
2645                       unsigned int flags) {
2646   struct watcher_list* w;
2647   uv_loop_t* loop;
2648   size_t len;
2649   int events;
2650   int err;
2651   int wd;
2652 
2653   if (uv__is_active(handle))
2654     return UV_EINVAL;
2655 
2656   loop = handle->loop;
2657 
2658   err = init_inotify(loop);
2659   if (err)
2660     return err;
2661 
2662   events = IN_ATTRIB
2663          | IN_CREATE
2664          | IN_MODIFY
2665          | IN_DELETE
2666          | IN_DELETE_SELF
2667          | IN_MOVE_SELF
2668          | IN_MOVED_FROM
2669          | IN_MOVED_TO;
2670 
2671   wd = inotify_add_watch(loop->inotify_fd, path, events);
2672   if (wd == -1)
2673     return UV__ERR(errno);
2674 
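       /* inotify_add_watch() returns the same descriptor for a path that is
        * already being watched, so handles for one path share a watcher_list.
        */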
2675   w = find_watcher(loop, wd);
2676   if (w)
2677     goto no_insert;
2678 
2679   len = strlen(path) + 1;
2680   w = uv__malloc(sizeof(*w) + len);
2681   if (w == NULL)
2682     return UV_ENOMEM;
2683 
2684   w->wd = wd;
2685   w->path = memcpy(w + 1, path, len);
2686   uv__queue_init(&w->watchers);
2687   w->iterating = 0;
2688   RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
2689 
2690 no_insert:
2691   uv__handle_start(handle);
2692   uv__queue_insert_tail(&w->watchers, &handle->watchers);
2693   handle->path = w->path;
2694   handle->cb = cb;
2695   handle->wd = wd;
2696 
2697   return 0;
2698 }
2699 
2700 
2701 int uv_fs_event_stop(uv_fs_event_t* handle) {
2702   struct watcher_list* w;
2703 
2704   if (!uv__is_active(handle))
2705     return 0;
2706 
2707   w = find_watcher(handle->loop, handle->wd);
2708   assert(w != NULL);
2709 
2710   handle->wd = -1;
2711   handle->path = NULL;
2712   uv__handle_stop(handle);
2713   uv__queue_remove(&handle->watchers);
2714 
2715   maybe_free_watcher_list(w, handle->loop);
2716 
2717   return 0;
2718 }
2719 
2720 
2721 void uv__fs_event_close(uv_fs_event_t* handle) {
2722   uv_fs_event_stop(handle);
2723 }
2724