17 #include <ndb_global.h> 
   18 #include <ndb_version.h> 
   23 #include <NdbConfig.h> 
   24 #include <NdbAutoPtr.hpp> 
   25 #include <portlib/ndb_daemon.h> 
   26 #include <portlib/NdbSleep.h> 
   27 #include <portlib/NdbDir.hpp> 
   29 #include <ConfigRetriever.hpp> 
   31 #include <EventLogger.hpp> 
   37   ndb_daemon_exit(code);
 
   40 #include "../mgmapi/mgmapi_configuration.hpp" 
   44                NodeId nodeid, 
int error_exit,
 
   45                bool restart, 
bool nostart, 
bool initial,
 
   46                Uint32 error, Uint32 signum, Uint32 sphase)
 
   50          (!restart && !initial && !nostart));
 
   52   Uint32 length, theData[25];
 
   55   rep->setNodeId(nodeid);
 
   79   g_eventLogger->
log(rep->getEventType(), theData, length,
 
   84   for (iter.first(); iter.valid(); iter.next())
 
   87     if (iter.get(CFG_TYPE_OF_SECTION, &type) ||
 
   88        type != NODE_TYPE_MGM)
 
   92     if (iter.get(CFG_MGM_PORT, &port))
 
   96     if (iter.get(CFG_NODE_HOST, &hostname))
 
  100     connect_str.
assfmt(
"%s:%d", hostname, port);
 
  106       g_eventLogger->
warning(
"Unable to report shutdown reason " 
  107                              "to '%s'(failed to create mgm handle)",
 
  108                              connect_str.
c_str());
 
  114         ndb_mgm_report_event(h, theData, length))
 
  116       g_eventLogger->
warning(
"Unable to report shutdown reason " 
  117                              "to '%s'(error: %s - %s)",
 
  131   static const int ignore_list[] = {
 
  141 #elif defined SIGINFO 
  177   for(
size_t i = 0; 
i < 
sizeof(ignore_list)/
sizeof(ignore_list[0]); 
i++)
 
  178     signal(ignore_list[
i], SIG_IGN);
 
  183 int pipe(
int pipefd[2]){
 
  184   const unsigned int buffer_size = 4096;
 
  186   return _pipe(pipefd, buffer_size, flags);
 
  194 static const int WNOHANG = 37;
 
  197 pid_t waitpid(pid_t pid, 
int *stat_loc, 
int options)
 
  200   assert(options == WNOHANG);
 
  203   HANDLE handle = OpenProcess(PROCESS_ALL_ACCESS, FALSE, pid);
 
  206     g_eventLogger->
error(
"waitpid: Could not open handle for pid %d, " 
  207                          "error: %d", pid, GetLastError());
 
  212   if (!GetExitCodeProcess(handle, &exit_code))
 
  214     g_eventLogger->
error(
"waitpid: GetExitCodeProcess failed, pid: %d, " 
  215                          "error: %d", pid, GetLastError());
 
  221   if (exit_code == STILL_ACTIVE)
 
  227   *stat_loc = exit_code;
 
  233 bool WIFEXITED(
int status)
 
  239 int WEXITSTATUS(
int status)
 
  245 bool WIFSIGNALED(
int status)
 
  251 int WTERMSIG(
int status)
 
  257 kill(pid_t pid, 
int sig)
 
  259   int retry_open_event = 10;
 
  261   char shutdown_event_name[32];
 
  262   _snprintf(shutdown_event_name, 
sizeof(shutdown_event_name),
 
  263             "ndbd_shutdown_%d", pid);
 
  266   HANDLE shutdown_event;
 
  267   while ((shutdown_event =
 
  268           OpenEvent(EVENT_MODIFY_STATE, FALSE, shutdown_event_name)) == NULL)
 
  275     HANDLE process = OpenProcess(SYNCHRONIZE | PROCESS_QUERY_INFORMATION,
 
  283     if (!GetExitCodeProcess(process,&exit_code))
 
  285       g_eventLogger->
error(
"GetExitCodeProcess failed, pid: %d, error: %d",
 
  286                            pid, GetLastError());
 
  287       CloseHandle(process);
 
  290     CloseHandle(process);
 
  292     if (exit_code != STILL_ACTIVE)
 
  298     if (retry_open_event--)
 
  302       g_eventLogger->
error(
"Failed to open shutdown_event '%s', error: %d",
 
  303                             shutdown_event_name, GetLastError());
 
  308   if (SetEvent(shutdown_event) == 0)
 
  310     g_eventLogger->
error(
"Failed to signal shutdown_event '%s', error: %d",
 
  311                          shutdown_event_name, GetLastError());
 
  313   CloseHandle(shutdown_event);
 
  318 extern int real_main(
int, 
char**);
 
  324   char **argv = (
char **)malloc(
sizeof(
char*) * (args.size() + 1));
 
  328   for(
unsigned i = 0; 
i < args.size(); 
i++)
 
  329     argv[
i] = strdup(args[
i].c_str());
 
  330   argv[args.size()] = NULL;
 
  336 void free_argv(
char** argv)
 
  354   DWORD len = GetModuleFileName(NULL, path, 
sizeof(path));
 
  355   if (len == 0 || len == 
sizeof(path))
 
  357     g_eventLogger->
warning(
"spawn_process: Could not extract full path, " 
  358                            "len: %u, error: %u\n",
 
  359                            len, GetLastError());
 
  368   char** argv = create_argv(args);
 
  371     g_eventLogger->
error(
"spawn_process: Failed to create argv, errno: %d",
 
  378   intptr_t spawn_handle = _spawnv(P_NOWAIT, progname, argv);
 
  379   if (spawn_handle == -1)
 
  381     g_eventLogger->
error(
"spawn_process: Failed to spawn process, errno: %d",
 
  384     g_eventLogger->
error(
" progname: '%s'", progname);
 
  387       g_eventLogger->
error(
"argv: '%s'", *argp++);
 
  395   DWORD pid = GetProcessId((HANDLE)spawn_handle);
 
  398     g_eventLogger->
error(
"spawn_process: Failed to convert handle %d " 
  399                          "to pid, error: %d", spawn_handle, GetLastError());
 
  400     CloseHandle((HANDLE)spawn_handle);
 
  403   CloseHandle((HANDLE)spawn_handle);
 
  409     g_eventLogger->
error(
"Failed to fork, errno: %d", errno);
 
  428   (void)real_main(argc, argv);
 
  443   const unsigned max_retries = 10;
 
  444   unsigned retry_counter = 0;
 
  447     pid_t pid = spawn_process(progname, args);
 
  450       if (retry_counter++ == max_retries)
 
  452         g_eventLogger->
error(
"Angel failed to spawn %d times, giving up",
 
  457       g_eventLogger->
warning(
"Angel failed to spawn, sleep and retry");
 
  459       NdbSleep_SecSleep(1);
 
  466 static Uint32 stop_on_error;
 
  467 static Uint32 config_max_start_fail_retries;
 
  468 static Uint32 config_restart_delay_secs; 
 
  478   Uint32 generation = 0;
 
  480   if (sys_iter.get(CFG_SYS_CONFIG_GENERATION, &generation))
 
  482     g_eventLogger->
warning(
"Configuration didn't contain generation " 
  483                            "(likely old ndb_mgmd");
 
  485   g_eventLogger->
debug(
"Using configuration with generation %u", generation);
 
  488   if (iter.find(CFG_NODE_ID, nodeid))
 
  490     g_eventLogger->
error(
"Invalid configuration fetched, could not " 
  491                          "find own node id %d", nodeid);
 
  495   if (iter.get(CFG_DB_STOP_ON_ERROR, &stop_on_error))
 
  497     g_eventLogger->
error(
"Invalid configuration fetched, could not " 
  501   g_eventLogger->
debug(
"Using StopOnError: %u", stop_on_error);
 
  503   if (iter.get(CFG_DB_MAX_START_FAIL, &config_max_start_fail_retries))
 
  506     config_max_start_fail_retries = 3;
 
  509   if (iter.get(CFG_DB_START_FAIL_DELAY_SECS, &config_restart_delay_secs))
 
  512     config_restart_delay_secs = 0;
 
  515   const char * datadir;
 
  516   if (iter.get(CFG_NODE_DATADIR, &datadir))
 
  518     g_eventLogger->
error(
"Invalid configuration fetched, could not " 
  522   g_eventLogger->
debug(
"Using DataDir: %s", datadir);
 
  524   NdbConfig_SetPath(datadir);
 
  526   if (NdbDir::chdir(NdbConfig_get_path(NULL)) != 0)
 
  528     g_eventLogger->
warning(
"Cannot change directory to '%s', error: %d",
 
  529                            NdbConfig_get_path(NULL), errno);
 
  536 bool stop_child = 
false;
 
  539 angel_run(
const char* progname,
 
  541           const char* connect_str,
 
  543           const char* bind_address,
 
  553   if (retriever.hasError())
 
  555     g_eventLogger->
error(
"Could not initialize connection to management " 
  556                          "server, error: '%s'", retriever.getErrorString());
 
  560   const int connnect_retries = 12;
 
  561   const int connect_delay = 5;
 
  562   const int verbose = 1;
 
  563   if (retriever.do_connect(connnect_retries, connect_delay, verbose) != 0)
 
  565     g_eventLogger->
error(
"Could not connect to management server, " 
  566                          "error: '%s'", retriever.getErrorString());
 
  569   g_eventLogger->
info(
"Angel connected to '%s:%d'",
 
  570                       retriever.get_mgmd_host(),
 
  571                       retriever.get_mgmd_port());
 
  573   const int alloc_retries = 2;
 
  574   const int alloc_delay = 3;
 
  575   const Uint32 nodeid = retriever.allocNodeId(alloc_retries, alloc_delay);
 
  578     g_eventLogger->
error(
"Failed to allocate nodeid, error: '%s'",
 
  579                          retriever.getErrorString());
 
  582   g_eventLogger->
info(
"Angel allocated nodeid: %u", nodeid);
 
  588     g_eventLogger->
error(
"Could not fetch configuration/invalid " 
  589                          "configuration, error: '%s'",
 
  590                          retriever.getErrorString());
 
  594   if (!configure(config, nodeid))
 
  603     char *lockfile = NdbConfig_PidFileName(nodeid);
 
  604     char *logfile = NdbConfig_StdoutFileName(nodeid);
 
  607     if (ndb_daemonize(lockfile, logfile) != 0)
 
  609       g_eventLogger->
error(
"Couldn't start as daemon, error: '%s'",
 
  616   Uint32 failed_startups_counter = 0;
 
  624       g_eventLogger->
error(
"Failed to create pipe, errno: %d (%s)",
 
  625                            errno, strerror(errno));
 
  630     if (!(child_info_r = fdopen(fds[0], 
"r")))
 
  632       g_eventLogger->
error(
"Failed to open stream for pipe, errno: %d (%s)",
 
  633                            errno, strerror(errno));
 
  642     args = original_args;
 
  646     one_arg.
assfmt(
"--report-fd=%d", fds[1]);
 
  647     args.push_back(one_arg);
 
  650     one_arg.
assfmt(
"--allocated-nodeid=%d", nodeid);
 
  651     args.push_back(one_arg);
 
  653     one_arg.
assfmt(
"--initial=%d", initial);
 
  654     args.push_back(one_arg);
 
  656     one_arg.
assfmt(
"--nostart=%d", no_start);
 
  657     args.push_back(one_arg);
 
  659     pid_t child = retry_spawn_process(progname, args);
 
  663       g_eventLogger->
error(
"retry_spawn_process, child: %d", child);
 
  670     g_eventLogger->
info(
"Angel pid: %d started child: %d",
 
  675     int status=0, error_exit=0;
 
  678       pid_t ret_pid = waitpid(child, &status, WNOHANG);
 
  679       if (ret_pid == child)
 
  681         g_eventLogger->
debug(
"Angel got child %d", child);
 
  686         g_eventLogger->
warning(
"Angel got unexpected pid %d " 
  687                                "when waiting for %d",
 
  693         g_eventLogger->
info(
"Angel shutting down ndbd with pid %d", child);
 
  696       NdbSleep_MilliSleep(100);
 
  704     Uint32 child_error = 0, child_signal = 0, child_sphase = 0;
 
  705     while (fgets(buf, 
sizeof (buf), child_info_r))
 
  708       if (sscanf(buf, 
"error=%d\n", &value) == 1)
 
  710       else if (sscanf(buf, 
"signal=%d\n", &value) == 1)
 
  711         child_signal = value;
 
  712       else if (sscanf(buf, 
"sphase=%d\n", &value) == 1)
 
  713         child_sphase = value;
 
  714       else if (strcmp(buf, 
"\n") != 0)
 
  715         fprintf(stderr, 
"unknown info from child: '%s'\n", buf);
 
  717     g_eventLogger->
debug(
"error: %u, signal: %u, sphase: %u",
 
  718                          child_error, child_signal, child_sphase);
 
  720     fclose(child_info_r);
 
  722     if (WIFEXITED(status))
 
  724       switch (WEXITSTATUS(status)) {
 
  726         g_eventLogger->
info(
"Angel shutting down");
 
  727         reportShutdown(config, nodeid, 0, 0, 
false, 
false,
 
  728                        child_error, child_signal, child_sphase);
 
  731       case NRT_NoStart_Restart:
 
  735       case NRT_NoStart_InitialStart:
 
  739       case NRT_DoStart_InitialStart:
 
  750           reportShutdown(config, nodeid,
 
  751                          error_exit, 0, 
false, 
false,
 
  752                          child_error, child_signal, child_sphase);
 
  756       case NRT_DoStart_Restart:
 
  764       if (WIFSIGNALED(status))
 
  766         child_signal = WTERMSIG(status);
 
  771         g_eventLogger->
info(
"Unknown exit reason. Stopped.");
 
  778         reportShutdown(config, nodeid,
 
  779                        error_exit, 0, 
false, 
false,
 
  780                        child_error, child_signal, child_sphase);
 
  786     const Uint32 STARTUP_FAILURE_SPHASE = 6;
 
  787     Uint32 restart_delay_secs = 0;
 
  789         child_sphase <= STARTUP_FAILURE_SPHASE)
 
  791       if (++failed_startups_counter >= config_max_start_fail_retries)
 
  793         g_eventLogger->
alert(
"Angel detected too many startup failures(%d), " 
  794                              "not restarting again", failed_startups_counter);
 
  795         reportShutdown(config, nodeid,
 
  796                        error_exit, 0, 
false, 
false,
 
  797                        child_error, child_signal, child_sphase);
 
  800       g_eventLogger->
info(
"Angel detected startup failure, count: %u",
 
  801                           failed_startups_counter);
 
  803       restart_delay_secs = config_restart_delay_secs;
 
  808       failed_startups_counter = 0;
 
  811     reportShutdown(config, nodeid,
 
  815                    child_error, child_signal, child_sphase);
 
  816     g_eventLogger->
info(
"Ndb has terminated (pid %d) restarting", child);
 
  818     g_eventLogger->
debug(
"Angel reconnecting to management server");
 
  819     (void)retriever.disconnect();
 
  821     if (restart_delay_secs > 0)
 
  823       g_eventLogger->
info(
"Delaying Ndb restart for %u seconds.",
 
  825       NdbSleep_SecSleep(restart_delay_secs);
 
  828     const int connnect_retries = 12;
 
  829     const int connect_delay = 5;
 
  830     const int verbose = 1;
 
  831     if (retriever.do_connect(connnect_retries, connect_delay, verbose) != 0)
 
  833       g_eventLogger->
error(
"Could not connect to management server, " 
  834                            "error: '%s'", retriever.getErrorString());
 
  837     g_eventLogger->
info(
"Angel reconnected to '%s:%d'",
 
  838                         retriever.get_mgmd_host(),
 
  839                         retriever.get_mgmd_port());
 
  842     retriever.setNodeId(nodeid);
 
  844     g_eventLogger->
debug(
"Angel reallocating nodeid %d", nodeid);
 
  845     const int alloc_retries = 10;
 
  846     const int alloc_delay = 3;
 
  847     const Uint32 realloced = retriever.allocNodeId(alloc_retries, alloc_delay);
 
  850       g_eventLogger->
error(
"Angel failed to allocate nodeid, error: '%s'",
 
  851                            retriever.getErrorString());
 
  854     if (realloced != nodeid)
 
  856       g_eventLogger->
error(
"Angel failed to reallocate nodeid %d, got %d",
 
  860     g_eventLogger->
info(
"Angel reallocated nodeid: %u", nodeid);
 
  871 void angel_stop(
void)