MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
epoll.c
1 /*
2  * Copyright 2000-2003 Niels Provos <provos@citi.umich.edu>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. The name of the author may not be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif
30 
31 #include <stdint.h>
32 #include <sys/types.h>
33 #include <sys/resource.h>
34 #ifdef HAVE_SYS_TIME_H
35 #include <sys/time.h>
36 #else
37 #include <sys/_time.h>
38 #endif
39 #include <sys/queue.h>
40 #include <sys/epoll.h>
41 #include <signal.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include <errno.h>
47 #ifdef HAVE_FCNTL_H
48 #include <fcntl.h>
49 #endif
50 
51 #include "event.h"
52 #include "event-internal.h"
53 #include "evsignal.h"
54 #include "log.h"
55 
/*
 * Due to limitations in the epoll interface, we need to keep track of
 * all file descriptors ourselves.  One of these records exists per
 * descriptor slot; the table holding them is indexed by fd.
 */
struct evepoll {
	struct event *evread;	/* event registered for reading, or NULL */
	struct event *evwrite;	/* event registered for writing, or NULL */
};
63 
/* State of one epoll backend instance, as built by epoll_init(). */
struct epollop {
	struct evepoll *fds;		/* per-descriptor table, indexed by fd */
	int nfds;			/* number of entries in fds */
	struct epoll_event *events;	/* result buffer handed to epoll_wait() */
	int nevents;			/* number of entries in events */
	int epfd;			/* descriptor returned by epoll_create() */
};
71 
/* Forward declarations of the backend entry points defined in this file. */
static void *epoll_init(struct event_base *base);
static int epoll_add(void *arg, struct event *ev);
static int epoll_del(void *arg, struct event *ev);
static int epoll_dispatch(struct event_base *base, void *arg,
    struct timeval *tv);
static void epoll_dealloc(struct event_base *base, void *arg);
77 
78 const struct eventop epollops = {
79  "epoll",
80  epoll_init,
81  epoll_add,
82  epoll_del,
83  epoll_dispatch,
84  epoll_dealloc,
85  1 /* need reinit */
86 };
87 
/*
 * FD_CLOSEONEXEC(x): issue fcntl(x, F_SETFD, 1) on descriptor x (the
 * literal 1 is the value of FD_CLOEXEC on Linux) and report a failure
 * through event_warn().  Expands to nothing when HAVE_SETFD is not
 * defined.
 */
#ifndef HAVE_SETFD
#define FD_CLOSEONEXEC(x)
#else
#define FD_CLOSEONEXEC(x) do { \
	if (fcntl(x, F_SETFD, 1) == -1) \
		event_warn("fcntl(%d, F_SETFD)", x); \
} while (0)
#endif
96 
/*
 * Default table size (and epoll_create() size hint) used by epoll_init()
 * when the RLIMIT_NOFILE soft limit cannot be read or is unlimited.
 */
#define NEVENT	32000
98 
/*
 * Upper bound, in milliseconds, on the timeout passed to epoll_wait().
 *
 * On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be as
 * big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the largest
 * number of msec we can support here is 2147482.  That is rounded down
 * by about 47 seconds to an even 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC	(35 * 60 * 1000)
106 
107 static void *
108 epoll_init(struct event_base *base)
109 {
110  int epfd, nfiles = NEVENT;
111  struct rlimit rl;
112  struct epollop *epollop;
113 
114  /* Disable epollueue when this environment variable is set */
115  if (getenv("EVENT_NOEPOLL"))
116  return (NULL);
117 
118  if (getrlimit(RLIMIT_NOFILE, &rl) == 0 &&
119  rl.rlim_cur != RLIM_INFINITY) {
120  /*
121  * Solaris is somewhat retarded - it's important to drop
122  * backwards compatibility when making changes. So, don't
123  * dare to put rl.rlim_cur here.
124  */
125  nfiles = rl.rlim_cur - 1;
126  }
127 
128  /* Initalize the kernel queue */
129 
130  if ((epfd = epoll_create(nfiles)) == -1) {
131  if (errno != ENOSYS)
132  event_warn("epoll_create");
133  return (NULL);
134  }
135 
136  FD_CLOSEONEXEC(epfd);
137 
138  if (!(epollop = calloc(1, sizeof(struct epollop))))
139  return (NULL);
140 
141  epollop->epfd = epfd;
142 
143  /* Initalize fields */
144  epollop->events = malloc(nfiles * sizeof(struct epoll_event));
145  if (epollop->events == NULL) {
146  free(epollop);
147  return (NULL);
148  }
149  epollop->nevents = nfiles;
150 
151  epollop->fds = calloc(nfiles, sizeof(struct evepoll));
152  if (epollop->fds == NULL) {
153  free(epollop->events);
154  free(epollop);
155  return (NULL);
156  }
157  epollop->nfds = nfiles;
158 
159  evsignal_init(base);
160 
161  return (epollop);
162 }
163 
164 static int
165 epoll_recalc(struct event_base *base, void *arg, int max)
166 {
167  struct epollop *epollop = arg;
168 
169  if (max >= epollop->nfds) {
170  struct evepoll *fds;
171  int nfds;
172 
173  nfds = epollop->nfds;
174  while (nfds <= max)
175  nfds <<= 1;
176 
177  fds = realloc(epollop->fds, nfds * sizeof(struct evepoll));
178  if (fds == NULL) {
179  event_warn("realloc");
180  return (-1);
181  }
182  epollop->fds = fds;
183  memset(fds + epollop->nfds, 0,
184  (nfds - epollop->nfds) * sizeof(struct evepoll));
185  epollop->nfds = nfds;
186  }
187 
188  return (0);
189 }
190 
191 static int
192 epoll_dispatch(struct event_base *base, void *arg, struct timeval *tv)
193 {
194  struct epollop *epollop = arg;
195  struct epoll_event *events = epollop->events;
196  struct evepoll *evep;
197  int i, res, timeout = -1;
198 
199  if (tv != NULL)
200  timeout = tv->tv_sec * 1000 + (tv->tv_usec + 999) / 1000;
201 
202  if (timeout > MAX_EPOLL_TIMEOUT_MSEC) {
203  /* Linux kernels can wait forever if the timeout is too big;
204  * see comment on MAX_EPOLL_TIMEOUT_MSEC. */
205  timeout = MAX_EPOLL_TIMEOUT_MSEC;
206  }
207 
208  res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
209 
210  if (res == -1) {
211  if (errno != EINTR) {
212  event_warn("epoll_wait");
213  return (-1);
214  }
215 
216  evsignal_process(base);
217  return (0);
218  } else if (base->sig.evsignal_caught) {
219  evsignal_process(base);
220  }
221 
222  event_debug(("%s: epoll_wait reports %d", __func__, res));
223 
224  for (i = 0; i < res; i++) {
225  int what = events[i].events;
226  struct event *evread = NULL, *evwrite = NULL;
227  int fd = events[i].data.fd;
228 
229  if (fd < 0 || fd >= epollop->nfds)
230  continue;
231  evep = &epollop->fds[fd];
232 
233  if (what & (EPOLLHUP|EPOLLERR)) {
234  evread = evep->evread;
235  evwrite = evep->evwrite;
236  } else {
237  if (what & EPOLLIN) {
238  evread = evep->evread;
239  }
240 
241  if (what & EPOLLOUT) {
242  evwrite = evep->evwrite;
243  }
244  }
245 
246  if (!(evread||evwrite))
247  continue;
248 
249  if (evread != NULL)
250  event_active(evread, EV_READ, 1);
251  if (evwrite != NULL)
252  event_active(evwrite, EV_WRITE, 1);
253  }
254 
255  return (0);
256 }
257 
258 
259 static int
260 epoll_add(void *arg, struct event *ev)
261 {
262  struct epollop *epollop = arg;
263  struct epoll_event epev = {0, {0}};
264  struct evepoll *evep;
265  int fd, op, events;
266 
267  if (ev->ev_events & EV_SIGNAL)
268  return (evsignal_add(ev));
269 
270  fd = ev->ev_fd;
271  if (fd >= epollop->nfds) {
272  /* Extent the file descriptor array as necessary */
273  if (epoll_recalc(ev->ev_base, epollop, fd) == -1)
274  return (-1);
275  }
276  evep = &epollop->fds[fd];
277  op = EPOLL_CTL_ADD;
278  events = 0;
279  if (evep->evread != NULL) {
280  events |= EPOLLIN;
281  op = EPOLL_CTL_MOD;
282  }
283  if (evep->evwrite != NULL) {
284  events |= EPOLLOUT;
285  op = EPOLL_CTL_MOD;
286  }
287 
288  if (ev->ev_events & EV_READ)
289  events |= EPOLLIN;
290  if (ev->ev_events & EV_WRITE)
291  events |= EPOLLOUT;
292 
293  epev.data.fd = fd;
294  epev.events = events;
295  if (epoll_ctl(epollop->epfd, op, ev->ev_fd, &epev) == -1)
296  return (-1);
297 
298  /* Update events responsible */
299  if (ev->ev_events & EV_READ)
300  evep->evread = ev;
301  if (ev->ev_events & EV_WRITE)
302  evep->evwrite = ev;
303 
304  return (0);
305 }
306 
307 static int
308 epoll_del(void *arg, struct event *ev)
309 {
310  struct epollop *epollop = arg;
311  struct epoll_event epev = {0, {0}};
312  struct evepoll *evep;
313  int fd, events, op;
314  int needwritedelete = 1, needreaddelete = 1;
315 
316  if (ev->ev_events & EV_SIGNAL)
317  return (evsignal_del(ev));
318 
319  fd = ev->ev_fd;
320  if (fd >= epollop->nfds)
321  return (0);
322  evep = &epollop->fds[fd];
323 
324  op = EPOLL_CTL_DEL;
325  events = 0;
326 
327  if (ev->ev_events & EV_READ)
328  events |= EPOLLIN;
329  if (ev->ev_events & EV_WRITE)
330  events |= EPOLLOUT;
331 
332  if ((events & (EPOLLIN|EPOLLOUT)) != (EPOLLIN|EPOLLOUT)) {
333  if ((events & EPOLLIN) && evep->evwrite != NULL) {
334  needwritedelete = 0;
335  events = EPOLLOUT;
336  op = EPOLL_CTL_MOD;
337  } else if ((events & EPOLLOUT) && evep->evread != NULL) {
338  needreaddelete = 0;
339  events = EPOLLIN;
340  op = EPOLL_CTL_MOD;
341  }
342  }
343 
344  epev.events = events;
345  epev.data.fd = fd;
346 
347  if (needreaddelete)
348  evep->evread = NULL;
349  if (needwritedelete)
350  evep->evwrite = NULL;
351 
352  if (epoll_ctl(epollop->epfd, op, fd, &epev) == -1)
353  return (-1);
354 
355  return (0);
356 }
357 
358 static void
359 epoll_dealloc(struct event_base *base, void *arg)
360 {
361  struct epollop *epollop = arg;
362 
363  evsignal_dealloc(base);
364  if (epollop->fds)
365  free(epollop->fds);
366  if (epollop->events)
367  free(epollop->events);
368  if (epollop->epfd >= 0)
369  close(epollop->epfd);
370 
371  memset(epollop, 0, sizeof(struct epollop));
372  free(epollop);
373 }