/* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "uv.h"
#include "internal.h"

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <sys/sysctl.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#if defined(__FreeBSD__)
#include <sys/user.h>
#endif
#include <unistd.h>
#include <fcntl.h>
#include <time.h>

/*
 * Defining EV_OOBAND as EV_FLAG1 is required on
 * - FreeBSD, until at least 11.0
 * - older versions of Mac OS X
 *
 * http://www.boost.org/doc/libs/1_61_0/boost/asio/detail/kqueue_reactor.hpp
 */
#ifndef EV_OOBAND
#define EV_OOBAND  EV_FLAG1
#endif

static void uv__fs_event(uv_loop_t* loop, uv__io_t* w, unsigned int fflags);


int uv__kqueue_init(uv_loop_t* loop) {
  loop->backend_fd = kqueue();
  if (loop->backend_fd == -1)
    return UV__ERR(errno);

  uv__cloexec(loop->backend_fd, 1);

  return 0;
}


#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
static _Atomic int uv__has_forked_with_cfrunloop;
#endif

int uv__io_fork(uv_loop_t* loop) {
  int err;
  loop->backend_fd = -1;
  err = uv__kqueue_init(loop);
  if (err)
    return err;

#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
  if (loop->cf_state != NULL) {
    /* We cannot start another CFRunloop and/or thread in the child
       process; CF aborts if you try or if you try to touch the thread
       at all to kill it. So the best we can do is ignore it from now
       on. This means we can't watch directories in the same way
       anymore (like other BSDs). It also means we cannot properly
       clean up the allocated resources; calling
       uv__fsevents_loop_delete from uv_loop_close will crash the
       process. So we sidestep the issue by pretending like we never
       started it in the first place.
    */
    atomic_store_explicit(&uv__has_forked_with_cfrunloop,
                          1,
                          memory_order_relaxed);
    uv__free(loop->cf_state);
    loop->cf_state = NULL;
  }
#endif /* #if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070 */
  return err;
}

int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct kevent ev;
  int rc;
  struct stat sb;
#ifdef __APPLE__
  char path[MAXPATHLEN];
#endif

  if (uv__fstat(fd, &sb))
    return UV__ERR(errno);

  /* On FreeBSD, kqueue only supports EVFILT_READ notification for regular files
   * and always reports ready events for writing, resulting in busy-looping.
   *
   * On Darwin, DragonFlyBSD, NetBSD and OpenBSD, kqueue reports ready events for
   * regular files as readable and writable only once, acting like an EV_ONESHOT.
   *
   * Neither of the above cases should be added to the kqueue.
   */
  if (S_ISREG(sb.st_mode) || S_ISDIR(sb.st_mode))
    return UV_EINVAL;

#ifdef __APPLE__
  /* On Darwin (both macOS and iOS), in addition to regular files, FIFOs also don't
   * work properly with kqueue: the disconnection from the last writer won't trigger
   * an event for kqueue in spite of what the man pages say. Thus, we also disallow
   * the case of S_IFIFO. */
  if (S_ISFIFO(sb.st_mode)) {
    /* File descriptors for FIFOs, pipes and kqueues all report the same file
     * type, so there is no way to tell them apart via stat.st_mode & S_IFMT.
     * Fortunately, a FIFO is the only one of the three backed by a path on the
     * filesystem, which is how we distinguish it below. */
    if (!fcntl(fd, F_GETPATH, path))
      return UV_EINVAL;
  }
#endif

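  /* Probe the fd: try to add, then immediately remove, an EVFILT_READ event.
   * If the add fails, the descriptor cannot be watched with kqueue. */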
  rc = 0;
  EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, 0);
  if (kevent(loop->backend_fd, &ev, 1, NULL, 0, NULL))
    rc = UV__ERR(errno);

  EV_SET(&ev, fd, EVFILT_READ, EV_DELETE, 0, 0, 0);
  if (rc == 0)
    if (kevent(loop->backend_fd, &ev, 1, NULL, 0, NULL))
      abort();

  return rc;
}


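/* Best-effort removal of a kevent that is no longer wanted. EBADF (the fd was
 * already closed) and ENOENT (the event was never registered or has already
 * been removed) are expected and ignored; any other failure means the loop
 * state is inconsistent, so abort. */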
static void uv__kqueue_delete(int kqfd, const struct kevent *ev) {
  struct kevent change;

  EV_SET(&change, ev->ident, ev->filter, EV_DELETE, 0, 0, 0);

  if (0 == kevent(kqfd, &change, 1, NULL, 0, NULL))
    return;

  if (errno == EBADF || errno == ENOENT)
    return;

  abort();
}


void uv__io_poll(uv_loop_t* loop, int timeout) {
  uv__loop_internal_fields_t* lfields;
  struct kevent events[1024];
  struct kevent* ev;
  struct timespec spec;
  unsigned int nevents;
  unsigned int revents;
  struct uv__queue* q;
  uv__io_t* w;
  uv_process_t* process;
  sigset_t* pset;
  sigset_t set;
  uint64_t base;
  uint64_t diff;
  int have_signals;
  int filter;
  int fflags;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  if (loop->nfds == 0) {
    assert(uv__queue_empty(&loop->watcher_queue));
    return;
  }

  lfields = uv__get_internal_fields(loop);
  nevents = 0;

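  /* Translate pending watcher changes into kevent change records, batching
   * them through the events array. */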
  while (!uv__queue_empty(&loop->watcher_queue)) {
    q = uv__queue_head(&loop->watcher_queue);
    uv__queue_remove(q);
    uv__queue_init(q);

    w = uv__queue_data(q, uv__io_t, watcher_queue);
    assert(w->pevents != 0);
    assert(w->fd >= 0);
    assert(w->fd < (int) loop->nwatchers);

    if ((w->events & POLLIN) == 0 && (w->pevents & POLLIN) != 0) {
      filter = EVFILT_READ;
      fflags = 0;
      op = EV_ADD;

      if (w->cb == uv__fs_event) {
        filter = EVFILT_VNODE;
        fflags = NOTE_ATTRIB | NOTE_WRITE  | NOTE_RENAME
               | NOTE_DELETE | NOTE_EXTEND | NOTE_REVOKE;
        op = EV_ADD | EV_ONESHOT; /* Stop the event from firing repeatedly. */
      }

      EV_SET(events + nevents, w->fd, filter, op, fflags, 0, 0);

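      /* Submit the accumulated change list once the batch buffer is full.
       * This kevent() call only applies changes; it does not wait for or
       * receive events. */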
      if (++nevents == ARRAY_SIZE(events)) {
        if (kevent(loop->backend_fd, events, nevents, NULL, 0, NULL))
          abort();
        nevents = 0;
      }
    }

    if ((w->events & POLLOUT) == 0 && (w->pevents & POLLOUT) != 0) {
      EV_SET(events + nevents, w->fd, EVFILT_WRITE, EV_ADD, 0, 0, 0);

      if (++nevents == ARRAY_SIZE(events)) {
        if (kevent(loop->backend_fd, events, nevents, NULL, 0, NULL))
          abort();
        nevents = 0;
      }
    }

    if ((w->events & UV__POLLPRI) == 0 && (w->pevents & UV__POLLPRI) != 0) {
      EV_SET(events + nevents, w->fd, EV_OOBAND, EV_ADD, 0, 0, 0);

      if (++nevents == ARRAY_SIZE(events)) {
        if (kevent(loop->backend_fd, events, nevents, NULL, 0, NULL))
          abort();
        nevents = 0;
      }
    }

    w->events = w->pevents;
  }

  pset = NULL;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    pset = &set;
    sigemptyset(pset);
    sigaddset(pset, SIGPROF);
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */

  if (lfields->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
  }

  for (;; nevents = 0) {
    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    if (timeout != -1) {
      spec.tv_sec = timeout / 1000;
      spec.tv_nsec = (timeout % 1000) * 1000000;
    }

    if (pset != NULL)
      pthread_sigmask(SIG_BLOCK, pset, NULL);

    /* Store the current timeout in a location that's globally accessible so
     * other locations like uv__work_done() can determine whether the queue
     * of events in the callback were waiting when poll was called.
     */
    lfields->current_timeout = timeout;

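    /* The events array doubles as the change list (pending watcher updates
     * accumulated above) and as the receive buffer for this kevent() call. */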
    nfds = kevent(loop->backend_fd,
                  events,
                  nevents,
                  events,
                  ARRAY_SIZE(events),
                  timeout == -1 ? NULL : &spec);

    if (nfds == -1)
      assert(errno == EINTR);
    else if (nfds == 0)
      /* Unlimited timeout should only return with events or signal. */
      assert(timeout != -1);

    if (pset != NULL)
      pthread_sigmask(SIG_UNBLOCK, pset, NULL);

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    uv__update_time(loop);

    if (nfds == 0 || nfds == -1) {
      /* If kqueue is empty or interrupted, we might still have children ready
       * to reap immediately. */
      if (loop->flags & UV_LOOP_REAP_CHILDREN) {
        loop->flags &= ~UV_LOOP_REAP_CHILDREN;
        uv__wait_children(loop);
        assert((reset_timeout == 0 ? timeout : user_timeout) == 0);
        return; /* Equivalent to fall-through behavior. */
      }

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      } else if (nfds == 0) {
        return;
      }

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_signals = 0;
    nevents = 0;

    assert(loop->watchers != NULL);
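    /* Stash the results buffer and count just past the end of the watchers
     * array so that uv__platform_invalidate_fd() can find and invalidate
     * pending events for fds that get closed from inside a callback. */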
    loop->watchers[loop->nwatchers] = (void*) events;
    loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
    for (i = 0; i < nfds; i++) {
      ev = events + i;
      fd = ev->ident;

      /* Handle kevent NOTE_EXIT results */
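      /* For EVFILT_PROC events the ident is a process id, not an fd. Mark the
       * matching process handle for reaping; uv__wait_children() runs once
       * the event sweep below has finished. */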
      if (ev->filter == EVFILT_PROC) {
        uv__queue_foreach(q, &loop->process_handles) {
          process = uv__queue_data(q, uv_process_t, queue);
          if (process->pid == fd) {
            process->flags |= UV_HANDLE_REAP;
            loop->flags |= UV_LOOP_REAP_CHILDREN;
            break;
          }
        }
        nevents++;
        continue;
      }

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;
      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it. */
        uv__kqueue_delete(loop->backend_fd, ev);
        continue;
      }

#if UV__KQUEUE_EVFILT_USER
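      /* When UV__KQUEUE_EVFILT_USER is available, async handle wakeups
       * (see uv_async_send()) arrive as EVFILT_USER events; route them
       * straight to the loop's async io watcher. */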
      if (ev->filter == EVFILT_USER) {
        w = &loop->async_io_watcher;
        assert(fd == w->fd);
        uv__metrics_update_idle_time(loop);
        w->cb(loop, w, w->events);
        nevents++;
        continue;
      }
#endif

      if (ev->filter == EVFILT_VNODE) {
        assert(w->events == POLLIN);
        assert(w->pevents == POLLIN);
        uv__metrics_update_idle_time(loop);
        w->cb(loop, w, ev->fflags); /* XXX always uv__fs_event() */
        nevents++;
        continue;
      }

      revents = 0;

      if (ev->filter == EVFILT_READ) {
        if (w->pevents & POLLIN)
          revents |= POLLIN;
        else
          uv__kqueue_delete(loop->backend_fd, ev);

        if ((ev->flags & EV_EOF) && (w->pevents & UV__POLLRDHUP))
          revents |= UV__POLLRDHUP;
      }

      if (ev->filter == EV_OOBAND) {
        if (w->pevents & UV__POLLPRI)
          revents |= UV__POLLPRI;
        else
          uv__kqueue_delete(loop->backend_fd, ev);
      }

      if (ev->filter == EVFILT_WRITE) {
        if (w->pevents & POLLOUT)
          revents |= POLLOUT;
        else
          uv__kqueue_delete(loop->backend_fd, ev);
      }

      if (ev->flags & EV_ERROR)
        revents |= POLLERR;

      if (revents == 0)
        continue;

      /* Run signal watchers last.  This also affects child process watchers
       * because those are implemented in terms of signal watchers.
       */
      if (w == &loop->signal_io_watcher) {
        have_signals = 1;
      } else {
        uv__metrics_update_idle_time(loop);
        w->cb(loop, w, revents);
      }

      nevents++;
    }

    if (loop->flags & UV_LOOP_REAP_CHILDREN) {
      loop->flags &= ~UV_LOOP_REAP_CHILDREN;
      uv__wait_children(loop);
    }

    uv__metrics_inc_events(loop, nevents);
    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
      uv__metrics_inc_events_waiting(loop, nevents);
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    loop->watchers[loop->nwatchers] = NULL;
    loop->watchers[loop->nwatchers + 1] = NULL;

    if (have_signals != 0)
      return;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      return;
    }

update_timeout:
    if (timeout == 0)
      return;

    if (timeout == -1)
      continue;

    assert(timeout > 0);

    diff = loop->time - base;
    if (diff >= (uint64_t) timeout)
      return;

    timeout -= diff;
  }
}


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  struct kevent* events;
  uintptr_t i;
  uintptr_t nfds;

  assert(loop->watchers != NULL);
  assert(fd >= 0);

  events = (struct kevent*) loop->watchers[loop->nwatchers];
  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
  if (events == NULL)
    return;

  /* Invalidate events with same file descriptor */
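  /* EVFILT_PROC events carry a process id in ident rather than a file
   * descriptor, so they are left untouched. */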
  for (i = 0; i < nfds; i++)
    if ((int) events[i].ident == fd && events[i].filter != EVFILT_PROC)
      events[i].ident = -1;
}


static void uv__fs_event(uv_loop_t* loop, uv__io_t* w, unsigned int fflags) {
  uv_fs_event_t* handle;
  struct kevent ev;
  int events;
  const char* path;
#if defined(F_GETPATH)
  /* MAXPATHLEN == PATH_MAX but the former is what XNU calls it internally. */
  char pathbuf[MAXPATHLEN];
#endif

  handle = container_of(w, uv_fs_event_t, event_watcher);

  if (fflags & (NOTE_ATTRIB | NOTE_EXTEND))
    events = UV_CHANGE;
  else
    events = UV_RENAME;

  path = NULL;
#if defined(F_GETPATH)
  /* Also works when the file has been unlinked from the file system. Passing
   * in the path when the file has been deleted is arguably a little strange
   * but it's consistent with what the inotify backend does.
   */
  if (fcntl(handle->event_watcher.fd, F_GETPATH, pathbuf) == 0)
    path = uv__basename_r(pathbuf);
#elif defined(F_KINFO)
  /* Look up the file's path through the descriptor with F_KINFO. The struct's
   * kf_structsize member must be initialised beforehand, either with the
   * KINFO_FILE_SIZE constant or as done below.
   */
  struct stat statbuf;
  struct kinfo_file kf;

  if (handle->event_watcher.fd != -1 &&
     (!uv__fstat(handle->event_watcher.fd, &statbuf) && !(statbuf.st_mode & S_IFDIR))) {
     /* KINFO_FILE_SIZE is deliberately not used here: it is not available on
      * non-Intel architectures, and on Intel it evaluates to the same value
      * (1392) anyway. The man page also documents initialising kf_structsize
      * this way.
      */
     kf.kf_structsize = sizeof(kf);
     if (fcntl(handle->event_watcher.fd, F_KINFO, &kf) == 0)
       path = uv__basename_r(kf.kf_path);
  }
#endif
  handle->cb(handle, path, events, 0);

  if (handle->event_watcher.fd == -1)
    return;

  /* Watcher operates in one-shot mode, re-arm it. */
  fflags = NOTE_ATTRIB | NOTE_WRITE  | NOTE_RENAME
         | NOTE_DELETE | NOTE_EXTEND | NOTE_REVOKE;

  EV_SET(&ev, w->fd, EVFILT_VNODE, EV_ADD | EV_ONESHOT, fflags, 0, 0);

  if (kevent(loop->backend_fd, &ev, 1, NULL, 0, NULL))
    abort();
}


int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
  return 0;
}


int uv_fs_event_start(uv_fs_event_t* handle,
                      uv_fs_event_cb cb,
                      const char* path,
                      unsigned int flags) {
  int fd;
#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
  struct stat statbuf;
#endif

  if (uv__is_active(handle))
    return UV_EINVAL;

  handle->cb = cb;
  handle->path = uv__strdup(path);
  if (handle->path == NULL)
    return UV_ENOMEM;

  /* TODO open asynchronously - but how do we report back errors? */
  fd = open(handle->path, O_RDONLY);
  if (fd == -1) {
    uv__free(handle->path);
    handle->path = NULL;
    return UV__ERR(errno);
  }

#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
  /* Nullify fields so they can be checked later */
  handle->cf_cb = NULL;
  handle->realpath = NULL;
  handle->realpath_len = 0;
  handle->cf_flags = flags;

  if (uv__fstat(fd, &statbuf))
    goto fallback;
  /* FSEvents works only with directories */
  if (!(statbuf.st_mode & S_IFDIR))
    goto fallback;

  if (0 == atomic_load_explicit(&uv__has_forked_with_cfrunloop,
                                memory_order_relaxed)) {
    int r;
    /* The fallback fd is no longer needed */
    uv__close_nocheckstdio(fd);
    handle->event_watcher.fd = -1;
    r = uv__fsevents_init(handle);
    if (r == 0) {
      uv__handle_start(handle);
    } else {
      uv__free(handle->path);
      handle->path = NULL;
    }
    return r;
  }
fallback:
#endif /* #if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070 */

  uv__handle_start(handle);
  uv__io_init(&handle->event_watcher, uv__fs_event, fd);
  uv__io_start(handle->loop, &handle->event_watcher, POLLIN);

  return 0;
}


int uv_fs_event_stop(uv_fs_event_t* handle) {
  int r;
  r = 0;

  if (!uv__is_active(handle))
    return 0;

  uv__handle_stop(handle);

#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
  if (0 == atomic_load_explicit(&uv__has_forked_with_cfrunloop,
                                memory_order_relaxed))
    if (handle->cf_cb != NULL)
      r = uv__fsevents_close(handle);
#endif

  if (handle->event_watcher.fd != -1) {
    uv__io_close(handle->loop, &handle->event_watcher);
    uv__close(handle->event_watcher.fd);
    handle->event_watcher.fd = -1;
  }

  uv__free(handle->path);
  handle->path = NULL;

  return r;
}


void uv__fs_event_close(uv_fs_event_t* handle) {
  uv_fs_event_stop(handle);
}
669