MySQL 5.6.14 Source Code Document
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
WatchDog.cpp
1 /*
2  Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
3 
4  This program is free software; you can redistribute it and/or modify
5  it under the terms of the GNU General Public License as published by
6  the Free Software Foundation; version 2 of the License.
7 
8  This program is distributed in the hope that it will be useful,
9  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  GNU General Public License for more details.
12 
13  You should have received a copy of the GNU General Public License
14  along with this program; if not, write to the Free Software
15  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17 
18 
19 #include <ndb_global.h>
20 
21 #include "WatchDog.hpp"
22 #include "GlobalData.hpp"
23 #include <NdbOut.hpp>
24 #include <NdbSleep.h>
25 #include <ErrorHandlingMacros.hpp>
26 #include <Configuration.hpp>
27 #include <EventLogger.hpp>
28 
29 #include <NdbTick.h>
30 
31 extern EventLogger * g_eventLogger;
32 
33 extern "C"
34 void*
35 runWatchDog(void* w){
36  ((WatchDog*)w)->run();
37  return NULL;
38 }
39 
40 WatchDog::WatchDog(Uint32 interval) :
41  m_watchedCount(0)
42 {
43  setCheckInterval(interval);
44  m_mutex = NdbMutex_Create();
45  theStop = false;
46  theThreadPtr = 0;
47 }
48 
49 WatchDog::~WatchDog(){
50  doStop();
51  NdbMutex_Destroy(m_mutex);
52 }
53 
54 Uint32
55 WatchDog::setCheckInterval(Uint32 interval){
56  // An interval of less than 70ms is not acceptable
57  return theInterval = (interval < 70 ? 70 : interval);
58 }
59 
60 bool
61 WatchDog::registerWatchedThread(Uint32 *counter, Uint32 threadId)
62 {
63  bool ret;
64 
65  NdbMutex_Lock(m_mutex);
66 
67  if (m_watchedCount >= MAX_WATCHED_THREADS)
68  {
69  ret = false;
70  }
71  else
72  {
73  m_watchedList[m_watchedCount].m_watchCounter = counter;
74  m_watchedList[m_watchedCount].m_threadId = threadId;
75  NdbTick_getMicroTimer(&(m_watchedList[m_watchedCount].m_startTime));
76  m_watchedList[m_watchedCount].m_slowWarnDelay = theInterval;
77  m_watchedList[m_watchedCount].m_lastCounterValue = 0;
78  ++m_watchedCount;
79  ret = true;
80  }
81 
82  NdbMutex_Unlock(m_mutex);
83  return ret;
84 }
85 
86 void
87 WatchDog::unregisterWatchedThread(Uint32 threadId)
88 {
89  Uint32 i;
90  NdbMutex_Lock(m_mutex);
91 
92  for (i = 0; i < m_watchedCount; i++)
93  {
94  if (threadId == m_watchedList[i].m_threadId)
95  break;
96  }
97  assert(i < m_watchedCount);
98  m_watchedList[i] = m_watchedList[m_watchedCount - 1];
99  --m_watchedCount;
100 
101  NdbMutex_Unlock(m_mutex);
102 }
103 
104 struct NdbThread*
105 WatchDog::doStart()
106 {
107  theStop = false;
108  theThreadPtr = NdbThread_Create(runWatchDog,
109  (void**)this,
110  0, // default stack size
111  "ndb_watchdog",
112  NDB_THREAD_PRIO_HIGH);
113 
114  return theThreadPtr;
115 }
116 
117 void
118 WatchDog::doStop(){
119  void *status;
120  theStop = true;
121  if(theThreadPtr){
122  NdbThread_WaitFor(theThreadPtr, &status);
123  NdbThread_Destroy(&theThreadPtr);
124  }
125 }
126 
127 const char *get_action(Uint32 IPValue)
128 {
129  const char *action;
130  switch (IPValue) {
131  case 1:
132  action = "Job Handling";
133  break;
134  case 2:
135  action = "Scanning Timers";
136  break;
137  case 3:
138  action = "External I/O";
139  break;
140  case 4:
141  action = "Print Job Buffers at crash";
142  break;
143  case 5:
144  action = "Checking connections";
145  break;
146  case 6:
147  action = "Performing Send";
148  break;
149  case 7:
150  action = "Polling for Receive";
151  break;
152  case 8:
153  action = "Performing Receive";
154  break;
155  case 9:
156  action = "Allocating memory";
157  break;
158  default:
159  action = "Unknown place";
160  break;
161  }//switch
162  return action;
163 }
164 
165 
166 #ifdef _WIN32
/*
  Stand-in for the struct tms that <sys/times.h> provides on other
  platforms; filled in by the times() replacement below.
*/
struct tms {
  clock_t tms_utime;   // user time
  clock_t tms_stime;   // system time
  clock_t tms_cutime;  // user time of children
  clock_t tms_cstime;  // system time of children
};
173 
174 static clock_t
175 times(struct tms *buf)
176 {
177  if (!buf)
178  {
179  errno = EINVAL;
180  return -1;
181  }
182 
183  FILETIME create, exit, kernel, user;
184  if (GetProcessTimes(GetCurrentProcess(),
185  &create, &exit, &kernel, &user) == 0)
186  {
187  errno = GetLastError();
188  return -1;
189  }
190 
191  ULARGE_INTEGER ulint;
192  ulint.LowPart = kernel.dwLowDateTime;
193  ulint.HighPart = kernel.dwHighDateTime;
194  buf->tms_stime = (clock_t)ulint.QuadPart;
195  buf->tms_cstime = (clock_t)ulint.QuadPart;
196 
197  ulint.LowPart = user.dwLowDateTime;
198  ulint.HighPart = user.dwHighDateTime;
199  buf->tms_utime = (clock_t)ulint.QuadPart;
200  buf->tms_cutime = (clock_t)ulint.QuadPart;
201 
202  LARGE_INTEGER ticks;
203  if (QueryPerformanceCounter(&ticks) == 0)
204  {
205  errno = GetLastError();
206  return -1;
207  }
208 
209  return (clock_t)ticks.QuadPart;
210 }
211 
212 
213 #else
214 #include <sys/times.h>
215 #endif
216 
217 void
218 WatchDog::run()
219 {
220  unsigned int sleep_time;
221  struct MicroSecondTimer last_time, now;
222  Uint32 numThreads;
223  Uint32 counterValue[MAX_WATCHED_THREADS];
224  Uint32 oldCounterValue[MAX_WATCHED_THREADS];
225  Uint32 threadId[MAX_WATCHED_THREADS];
226  struct MicroSecondTimer start_time[MAX_WATCHED_THREADS];
227  Uint32 theIntervalCheck[MAX_WATCHED_THREADS];
228  Uint32 elapsed[MAX_WATCHED_THREADS];
229 
230  NdbTick_getMicroTimer(&last_time);
231 
232  // WatchDog for the single threaded NDB
233  while (!theStop)
234  {
235  sleep_time= 100;
236 
237  NdbSleep_MilliSleep(sleep_time);
238  if(theStop)
239  break;
240 
241  NdbTick_getMicroTimer(&now);
242  if (NdbTick_getMicrosPassed(last_time, now)/1000 > sleep_time*2)
243  {
244  struct tms my_tms;
245  times(&my_tms);
246  g_eventLogger->info("Watchdog: User time: %llu System time: %llu",
247  (Uint64)my_tms.tms_utime,
248  (Uint64)my_tms.tms_stime);
249  g_eventLogger->warning("Watchdog: Warning overslept %llu ms, expected %u ms.",
250  NdbTick_getMicrosPassed(last_time, now)/1000,
251  sleep_time);
252  }
253  last_time = now;
254 
255  /*
256  Copy out all active counters under locked mutex, then check them
257  afterwards without holding the mutex.
258  */
259  NdbMutex_Lock(m_mutex);
260  numThreads = m_watchedCount;
261  for (Uint32 i = 0; i < numThreads; i++)
262  {
263  counterValue[i] = *(m_watchedList[i].m_watchCounter);
264  if (counterValue[i] != 0)
265  {
266  /*
267  The thread responded since last check, so just update state until
268  next check.
269 
270  There is a small race here. If the thread changes the counter
271  in-between the read and setting to zero here in the watchdog
272  thread, then gets stuck immediately after, we may report the
273  wrong action that it got stuck on.
274  But there will be no reporting of non-stuck thread because of
275  this race, nor will there be missed reporting.
276  */
277  *(m_watchedList[i].m_watchCounter) = 0;
278  m_watchedList[i].m_startTime = now;
279  m_watchedList[i].m_slowWarnDelay = theInterval;
280  m_watchedList[i].m_lastCounterValue = counterValue[i];
281  }
282  else
283  {
284  start_time[i] = m_watchedList[i].m_startTime;
285  threadId[i] = m_watchedList[i].m_threadId;
286  oldCounterValue[i] = m_watchedList[i].m_lastCounterValue;
287  theIntervalCheck[i] = m_watchedList[i].m_slowWarnDelay;
288  elapsed[i] = (Uint32)NdbTick_getMicrosPassed(start_time[i], now)/1000;
289  if (oldCounterValue[i] == 9 && elapsed[i] >= theIntervalCheck[i])
290  m_watchedList[i].m_slowWarnDelay += theInterval;
291  }
292  }
293  NdbMutex_Unlock(m_mutex);
294 
295  /*
296  Now check each watched thread if it has reported progress since previous
297  check. Warn about any stuck threads, and eventually force shutdown the
298  server.
299  */
300  for (Uint32 i = 0; i < numThreads; i++)
301  {
302  if (counterValue[i] != 0)
303  continue;
304 
305  /*
306  Counter value == 9 indicates malloc going on, this can take some time
307  so only warn if we pass the watchdog interval
308  */
309  if (oldCounterValue[i] != 9 || elapsed[i] >= theIntervalCheck[i])
310  {
311  const char *last_stuck_action = get_action(oldCounterValue[i]);
312  g_eventLogger->warning("Ndb kernel thread %u is stuck in: %s "
313  "elapsed=%u",
314  threadId[i], last_stuck_action, elapsed[i]);
315  {
316  struct tms my_tms;
317  times(&my_tms);
318  g_eventLogger->info("Watchdog: User time: %llu System time: %llu",
319  (Uint64)my_tms.tms_utime,
320  (Uint64)my_tms.tms_stime);
321  }
322  if (elapsed[i] > 3 * theInterval)
323  {
324  shutdownSystem(last_stuck_action);
325  }
326  }
327  }
328  }
329  return;
330 }
331 
332 void
333 WatchDog::shutdownSystem(const char *last_stuck_action){
334 
335  ErrorReporter::handleError(NDBD_EXIT_WATCHDOG_TERMINATE,
336  last_stuck_action,
337  __FILE__,
338  NST_Watchdog);
339 }