/* libuv: src/unix/kqueue.c (revision b7d07d78) */
/* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "uv.h"
#include "internal.h"

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <sys/sysctl.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#if defined(__FreeBSD__)
#include <sys/user.h>
#endif
#include <unistd.h>
#include <fcntl.h>
#include <time.h>

/*
 * This fallback definition of EV_OOBAND is required on:
 * - FreeBSD, until at least 11.0
 * - Older versions of Mac OS X
 *
 * http://www.boost.org/doc/libs/1_61_0/boost/asio/detail/kqueue_reactor.hpp
 */
#ifndef EV_OOBAND
#define EV_OOBAND  EV_FLAG1
#endif

static void uv__fs_event(uv_loop_t* loop, uv__io_t* w, unsigned int fflags);


int uv__kqueue_init(uv_loop_t* loop) {
  loop->backend_fd = kqueue();
  if (loop->backend_fd == -1)
    return UV__ERR(errno);

  uv__cloexec(loop->backend_fd, 1);

  return 0;
}


#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
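/* Set in the child after fork() when the parent loop had an active FSEvents
 * CFRunLoop; uv_fs_event_start() and uv_fs_event_stop() check it so that the
 * child never touches FSEvents state again (see uv__io_fork() below). */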
static _Atomic int uv__has_forked_with_cfrunloop;
#endif

int uv__io_fork(uv_loop_t* loop) {
  int err;
  loop->backend_fd = -1;
  err = uv__kqueue_init(loop);
  if (err)
    return err;

#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
  if (loop->cf_state != NULL) {
    /* We cannot start another CFRunloop and/or thread in the child
       process; CF aborts if you try or if you try to touch the thread
       at all to kill it. So the best we can do is ignore it from now
       on. This means we can't watch directories in the same way
       anymore (like other BSDs). It also means we cannot properly
       clean up the allocated resources; calling
       uv__fsevents_loop_delete from uv_loop_close will crash the
       process. So we sidestep the issue by pretending like we never
       started it in the first place.
    */
    atomic_store_explicit(&uv__has_forked_with_cfrunloop,
                          1,
                          memory_order_relaxed);
    uv__free(loop->cf_state);
    loop->cf_state = NULL;
  }
#endif /* #if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070 */
  return err;
}


int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct kevent ev[2];
  struct stat sb;
#ifdef __APPLE__
  char path[MAXPATHLEN];
#endif

  if (uv__fstat(fd, &sb))
    return UV__ERR(errno);

  /* On FreeBSD, kqueue only supports EVFILT_READ notification for regular files
   * and always reports ready events for writing, resulting in busy-looping.
   *
   * On Darwin, DragonFlyBSD, NetBSD and OpenBSD, kqueue reports ready events for
   * regular files as readable and writable only once, acting like an EV_ONESHOT.
   *
   * Neither of the above cases should be added to the kqueue.
   */
  if (S_ISREG(sb.st_mode) || S_ISDIR(sb.st_mode))
    return UV_EINVAL;

#ifdef __APPLE__
  /* On Darwin (both macOS and iOS), in addition to regular files, FIFOs also don't
   * work properly with kqueue: the disconnection from the last writer won't trigger
   * an event for kqueue in spite of what the man pages say. Thus, we also disallow
   * the case of S_IFIFO. */
  if (S_ISFIFO(sb.st_mode)) {
    /* FIFO, pipe and kqueue descriptors all report the same file type, so
     * they cannot be told apart via stat.st_mode & S_IFMT. Fortunately, a
     * FIFO is the only one of the three that is backed by a path on the
     * file system, which is what lets us single it out here. */
    if (!fcntl(fd, F_GETPATH, path))
      return UV_EINVAL;
  }
#endif

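  /* Probe the descriptor by adding and immediately deleting an EVFILT_READ
   * filter: kevent() fails with a suitable errno if kqueue cannot watch this
   * kind of file descriptor. */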
  EV_SET(ev, fd, EVFILT_READ, EV_ADD, 0, 0, 0);
  EV_SET(ev + 1, fd, EVFILT_READ, EV_DELETE, 0, 0, 0);
  if (kevent(loop->backend_fd, ev, 2, NULL, 0, NULL))
    return UV__ERR(errno);

  return 0;
}


static void uv__kqueue_delete(int kqfd, const struct kevent *ev) {
  struct kevent change;

  EV_SET(&change, ev->ident, ev->filter, EV_DELETE, 0, 0, 0);

  if (0 == kevent(kqfd, &change, 1, NULL, 0, NULL))
    return;

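  /* EBADF means the descriptor has already been closed and ENOENT means the
   * event was never added or has already been deleted; neither is a problem
   * when all we want is to disarm the watcher. */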
  if (errno == EBADF || errno == ENOENT)
    return;

  abort();
}


void uv__io_poll(uv_loop_t* loop, int timeout) {
  uv__loop_internal_fields_t* lfields;
  struct kevent events[1024];
  struct kevent* ev;
  struct timespec spec;
  unsigned int nevents;
  unsigned int revents;
  struct uv__queue* q;
  uv__io_t* w;
  uv_process_t* process;
  sigset_t* pset;
  sigset_t set;
  uint64_t base;
  uint64_t diff;
  int have_signals;
  int filter;
  int fflags;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  if (loop->nfds == 0) {
    assert(uv__queue_empty(&loop->watcher_queue));
    return;
  }

  lfields = uv__get_internal_fields(loop);
  nevents = 0;

  while (!uv__queue_empty(&loop->watcher_queue)) {
    q = uv__queue_head(&loop->watcher_queue);
    uv__queue_remove(q);
    uv__queue_init(q);

    w = uv__queue_data(q, uv__io_t, watcher_queue);
    assert(w->pevents != 0);
    assert(w->fd >= 0);
    assert(w->fd < (int) loop->nwatchers);

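    /* Translate the watcher's pending interest into kqueue filters. Changes
     * are batched in the local events array and flushed with kevent() whenever
     * the array fills up. */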
    if ((w->events & POLLIN) == 0 && (w->pevents & POLLIN) != 0) {
      filter = EVFILT_READ;
      fflags = 0;
      op = EV_ADD;

      if (w->cb == uv__fs_event) {
        filter = EVFILT_VNODE;
        fflags = NOTE_ATTRIB | NOTE_WRITE  | NOTE_RENAME
               | NOTE_DELETE | NOTE_EXTEND | NOTE_REVOKE;
        op = EV_ADD | EV_ONESHOT; /* Stop the event from firing repeatedly. */
      }

      EV_SET(events + nevents, w->fd, filter, op, fflags, 0, 0);

      if (++nevents == ARRAY_SIZE(events)) {
        if (kevent(loop->backend_fd, events, nevents, NULL, 0, NULL))
          abort();
        nevents = 0;
      }
    }

    if ((w->events & POLLOUT) == 0 && (w->pevents & POLLOUT) != 0) {
      EV_SET(events + nevents, w->fd, EVFILT_WRITE, EV_ADD, 0, 0, 0);

      if (++nevents == ARRAY_SIZE(events)) {
        if (kevent(loop->backend_fd, events, nevents, NULL, 0, NULL))
          abort();
        nevents = 0;
      }
    }

    if ((w->events & UV__POLLPRI) == 0 && (w->pevents & UV__POLLPRI) != 0) {
      EV_SET(events + nevents, w->fd, EV_OOBAND, EV_ADD, 0, 0, 0);

      if (++nevents == ARRAY_SIZE(events)) {
        if (kevent(loop->backend_fd, events, nevents, NULL, 0, NULL))
          abort();
        nevents = 0;
      }
    }

    w->events = w->pevents;
  }

  pset = NULL;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    pset = &set;
    sigemptyset(pset);
    sigaddset(pset, SIGPROF);
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */

  if (lfields->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
  }

  for (;; nevents = 0) {
    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    if (timeout != -1) {
      spec.tv_sec = timeout / 1000;
      spec.tv_nsec = (timeout % 1000) * 1000000;
    }

    if (pset != NULL)
      pthread_sigmask(SIG_BLOCK, pset, NULL);

    /* Store the current timeout in a location that's globally accessible so
     * that other code, such as uv__work_done(), can determine whether the
     * events in the callback queue were already waiting when poll was called.
     */
    lfields->current_timeout = timeout;

    nfds = kevent(loop->backend_fd,
                  events,
                  nevents,
                  events,
                  ARRAY_SIZE(events),
                  timeout == -1 ? NULL : &spec);

    if (nfds == -1)
      assert(errno == EINTR);
    else if (nfds == 0)
      /* Unlimited timeout should only return with events or signal. */
      assert(timeout != -1);

    if (pset != NULL)
      pthread_sigmask(SIG_UNBLOCK, pset, NULL);

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    uv__update_time(loop);

    if (nfds == 0 || nfds == -1) {
      /* If kqueue is empty or interrupted, we might still have children ready
       * to reap immediately. */
      if (loop->flags & UV_LOOP_REAP_CHILDREN) {
        loop->flags &= ~UV_LOOP_REAP_CHILDREN;
        uv__wait_children(loop);
        assert((reset_timeout == 0 ? timeout : user_timeout) == 0);
        return; /* Equivalent to fall-through behavior. */
      }

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      } else if (nfds == 0) {
        return;
      }

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_signals = 0;
    nevents = 0;

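    /* Stash the events array and its length in the two spare watcher slots so
     * that uv__platform_invalidate_fd() can find and invalidate pending events
     * for descriptors that are closed from inside a callback. */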
    assert(loop->watchers != NULL);
    loop->watchers[loop->nwatchers] = (void*) events;
    loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
    for (i = 0; i < nfds; i++) {
      ev = events + i;
      fd = ev->ident;

      /* Handle EVFILT_PROC NOTE_EXIT results. For process events, ev->ident
       * (and thus fd here) is the child's pid rather than a file descriptor. */
      if (ev->filter == EVFILT_PROC) {
        uv__queue_foreach(q, &loop->process_handles) {
          process = uv__queue_data(q, uv_process_t, queue);
          if (process->pid == fd) {
            process->flags |= UV_HANDLE_REAP;
            loop->flags |= UV_LOOP_REAP_CHILDREN;
            break;
          }
        }
        nevents++;
        continue;
      }

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;
      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it. */
        uv__kqueue_delete(loop->backend_fd, ev);
        continue;
      }

#if UV__KQUEUE_EVFILT_USER
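      /* A user event used by the async handle to wake the loop (see
       * uv_async_send()); dispatch it straight to the loop's async watcher. */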
      if (ev->filter == EVFILT_USER) {
        w = &loop->async_io_watcher;
        assert(fd == w->fd);
        uv__metrics_update_idle_time(loop);
        w->cb(loop, w, w->events);
        nevents++;
        continue;
      }
#endif

      if (ev->filter == EVFILT_VNODE) {
        assert(w->events == POLLIN);
        assert(w->pevents == POLLIN);
        uv__metrics_update_idle_time(loop);
        w->cb(loop, w, ev->fflags); /* XXX always uv__fs_event() */
        nevents++;
        continue;
      }

      revents = 0;

      if (ev->filter == EVFILT_READ) {
        if (w->pevents & POLLIN)
          revents |= POLLIN;
        else
          uv__kqueue_delete(loop->backend_fd, ev);

        if ((ev->flags & EV_EOF) && (w->pevents & UV__POLLRDHUP))
          revents |= UV__POLLRDHUP;
      }

      if (ev->filter == EV_OOBAND) {
        if (w->pevents & UV__POLLPRI)
          revents |= UV__POLLPRI;
        else
          uv__kqueue_delete(loop->backend_fd, ev);
      }

      if (ev->filter == EVFILT_WRITE) {
        if (w->pevents & POLLOUT)
          revents |= POLLOUT;
        else
          uv__kqueue_delete(loop->backend_fd, ev);
      }

      if (ev->flags & EV_ERROR)
        revents |= POLLERR;

      if (revents == 0)
        continue;

      /* Run signal watchers last.  This also affects child process watchers
       * because those are implemented in terms of signal watchers.
       */
      if (w == &loop->signal_io_watcher) {
        have_signals = 1;
      } else {
        uv__metrics_update_idle_time(loop);
        w->cb(loop, w, revents);
      }

      nevents++;
    }

    if (loop->flags & UV_LOOP_REAP_CHILDREN) {
      loop->flags &= ~UV_LOOP_REAP_CHILDREN;
      uv__wait_children(loop);
    }

    uv__metrics_inc_events(loop, nevents);
    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
      uv__metrics_inc_events_waiting(loop, nevents);
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    loop->watchers[loop->nwatchers] = NULL;
    loop->watchers[loop->nwatchers + 1] = NULL;

    if (have_signals != 0)
      return;  /* Event loop should cycle now so don't poll again. */

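    /* If the events array came back full there may be more events pending in
     * the kernel, so poll again without blocking, but at most `count` times
     * before returning control to the rest of the loop. */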
    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      return;
    }

update_timeout:
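    /* Reached when the wait was interrupted by a signal or produced no
     * actionable events: charge the time already spent against the requested
     * timeout and stop polling once it has fully elapsed. */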
    if (timeout == 0)
      return;

    if (timeout == -1)
      continue;

    assert(timeout > 0);

    diff = loop->time - base;
    if (diff >= (uint64_t) timeout)
      return;

    timeout -= diff;
  }
}


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  struct kevent* events;
  uintptr_t i;
  uintptr_t nfds;

  assert(loop->watchers != NULL);
  assert(fd >= 0);

  events = (struct kevent*) loop->watchers[loop->nwatchers];
  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
  if (events == NULL)
    return;

  /* Invalidate any pending events that refer to the same file descriptor. */
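  /* EVFILT_PROC entries are skipped because their ident is a process id, not
   * a file descriptor, and it may happen to have the same numeric value. */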
  for (i = 0; i < nfds; i++)
    if ((int) events[i].ident == fd && events[i].filter != EVFILT_PROC)
      events[i].ident = -1;
}


static void uv__fs_event(uv_loop_t* loop, uv__io_t* w, unsigned int fflags) {
  uv_fs_event_t* handle;
  struct kevent ev;
  int events;
  const char* path;
#if defined(F_GETPATH)
  /* MAXPATHLEN == PATH_MAX but the former is what XNU calls it internally. */
  char pathbuf[MAXPATHLEN];
#endif

  handle = container_of(w, uv_fs_event_t, event_watcher);

  if (fflags & (NOTE_ATTRIB | NOTE_EXTEND))
    events = UV_CHANGE;
  else
    events = UV_RENAME;

  path = NULL;
#if defined(F_GETPATH)
  /* Also works when the file has been unlinked from the file system. Passing
   * in the path when the file has been deleted is arguably a little strange
   * but it's consistent with what the inotify backend does.
   */
  if (fcntl(handle->event_watcher.fd, F_GETPATH, pathbuf) == 0)
    path = uv__basename_r(pathbuf);
#elif defined(F_KINFO)
  /* Retrieve the file info for this descriptor via fcntl(F_KINFO). The
   * struct's kf_structsize member must be initialised beforehand, either
   * with the KINFO_FILE_SIZE constant or, as done here, with sizeof(kf).
   */
  struct stat statbuf;
  struct kinfo_file kf;

  if (handle->event_watcher.fd != -1 &&
     (!uv__fstat(handle->event_watcher.fd, &statbuf) && !(statbuf.st_mode & S_IFDIR))) {
     /* We deliberately avoid KINFO_FILE_SIZE here: the constant is not
      * available on non-Intel architectures, and on Intel it evaluates to
      * the same 1392 bytes as sizeof(kf) anyway. The man page also notes
      * that initialising kf_structsize this way is acceptable.
      */
     kf.kf_structsize = sizeof(kf);
     if (fcntl(handle->event_watcher.fd, F_KINFO, &kf) == 0)
       path = uv__basename_r(kf.kf_path);
  }
#endif
  handle->cb(handle, path, events, 0);

  if (handle->event_watcher.fd == -1)
    return;

  /* Watcher operates in one-shot mode, re-arm it. */
  fflags = NOTE_ATTRIB | NOTE_WRITE  | NOTE_RENAME
         | NOTE_DELETE | NOTE_EXTEND | NOTE_REVOKE;

  EV_SET(&ev, w->fd, EVFILT_VNODE, EV_ADD | EV_ONESHOT, fflags, 0, 0);

  if (kevent(loop->backend_fd, &ev, 1, NULL, 0, NULL))
    abort();
}


int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
  return 0;
}


int uv_fs_event_start(uv_fs_event_t* handle,
                      uv_fs_event_cb cb,
                      const char* path,
                      unsigned int flags) {
  int fd;
#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
  struct stat statbuf;
#endif

  if (uv__is_active(handle))
    return UV_EINVAL;

  handle->cb = cb;
  handle->path = uv__strdup(path);
  if (handle->path == NULL)
    return UV_ENOMEM;

  /* TODO open asynchronously - but how do we report back errors? */
  fd = open(handle->path, O_RDONLY);
  if (fd == -1) {
    uv__free(handle->path);
    handle->path = NULL;
    return UV__ERR(errno);
  }

#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
  /* Nullify these fields so later checks can tell whether FSEvents was set up. */
  handle->cf_cb = NULL;
  handle->realpath = NULL;
  handle->realpath_len = 0;
  handle->cf_flags = flags;

  if (uv__fstat(fd, &statbuf))
    goto fallback;
  /* FSEvents works only with directories */
  if (!(statbuf.st_mode & S_IFDIR))
    goto fallback;

  if (0 == atomic_load_explicit(&uv__has_forked_with_cfrunloop,
                                memory_order_relaxed)) {
    int r;
    /* The fallback fd is no longer needed */
    uv__close_nocheckstdio(fd);
    handle->event_watcher.fd = -1;
    r = uv__fsevents_init(handle);
    if (r == 0) {
      uv__handle_start(handle);
    } else {
      uv__free(handle->path);
      handle->path = NULL;
    }
    return r;
  }
fallback:
#endif /* #if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070 */

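  /* kqueue fallback: watch the opened descriptor with an EVFILT_VNODE filter,
   * dispatched through uv__fs_event() above. This is the regular path on the
   * other BSDs and is used on Darwin for non-directories or after fork(). */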
  uv__handle_start(handle);
  uv__io_init(&handle->event_watcher, uv__fs_event, fd);
  uv__io_start(handle->loop, &handle->event_watcher, POLLIN);

  return 0;
}


int uv_fs_event_stop(uv_fs_event_t* handle) {
  int r;
  r = 0;

  if (!uv__is_active(handle))
    return 0;

  uv__handle_stop(handle);

#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED >= 1070
  if (0 == atomic_load_explicit(&uv__has_forked_with_cfrunloop,
                                memory_order_relaxed))
    if (handle->cf_cb != NULL)
      r = uv__fsevents_close(handle);
#endif

  if (handle->event_watcher.fd != -1) {
    uv__io_close(handle->loop, &handle->event_watcher);
    uv__close(handle->event_watcher.fd);
    handle->event_watcher.fd = -1;
  }

  uv__free(handle->path);
  handle->path = NULL;

  return r;
}


void uv__fs_event_close(uv_fs_event_t* handle) {
  uv_fs_event_stop(handle);
}