tgstation-server 6.19.1
The /tg/station 13 server suite
Loading...
Searching...
No Matches
WatchdogBase.cs
Go to the documentation of this file.
1using System;
4using System.Linq;
7
9
10using Prometheus;
11
12using Serilog.Context;
13
30
32{
36#pragma warning disable CA1506 // TODO: Decomplexify
38 {
41
44
46 public uint? ClientCount { get; private set; }
47
49 public DateTimeOffset? LaunchTime => GetActiveController()?.LaunchTime;
50
53 {
54 get => status;
55 protected set
56 {
58 status = value;
59 Logger.LogTrace("Status set from {oldStatus} to {status}", oldStatus, status);
60 }
61 }
62
65
67 public abstract bool AlphaIsActive { get; }
68
71
73 public DreamDaemonLaunchParameters? LastLaunchParameters { get; protected set; }
74
76 public Models.CompileJob? ActiveCompileJob => GetActiveController()?.CompileJob;
77
79 public abstract RebootState? RebootState { get; }
80
85
89 protected ILogger<WatchdogBase> Logger { get; }
90
94 protected IChatManager Chat { get; }
95
100
104 protected IDmbFactory DmbFactory { get; }
105
109 protected IAsyncDelayer AsyncDelayer { get; }
110
114 protected IIOManager GameIOManager { get; }
115
120
125
130
135
140
145
150
155
160
165
170
175
179 volatile TaskCompletionSource activeParametersUpdated;
180
185
190
195
200
205
210
230 protected WatchdogBase(
231 IChatManager chat,
232 ISessionControllerFactory sessionControllerFactory,
233 IDmbFactory dmbFactory,
236 IServerControl serverControl,
237 IAsyncDelayer asyncDelayer,
241 IMetricFactory metricFactory,
242 IIOManager gameIOManager,
245 Api.Models.Instance metadata,
246 bool autoStart)
247 {
248 Chat = chat ?? throw new ArgumentNullException(nameof(chat));
249 SessionControllerFactory = sessionControllerFactory ?? throw new ArgumentNullException(nameof(sessionControllerFactory));
250 DmbFactory = dmbFactory ?? throw new ArgumentNullException(nameof(dmbFactory));
253 AsyncDelayer = asyncDelayer ?? throw new ArgumentNullException(nameof(asyncDelayer));
257 ArgumentNullException.ThrowIfNull(metricFactory);
258 GameIOManager = gameIOManager ?? throw new ArgumentNullException(nameof(gameIOManager));
259 Logger = logger ?? throw new ArgumentNullException(nameof(logger));
263
264 ArgumentNullException.ThrowIfNull(serverControl);
265
266 watchdogStatusMetric = metricFactory.CreateGauge(
267 "tgs_watchdog_status",
268 $"TGS Watchdog status: {(int)WatchdogStatus.Offline} = Offline, {(int)WatchdogStatus.Online} = Online, {(int)WatchdogStatus.Restoring} = Restoring, {(int)WatchdogStatus.DelayedRestart} = Delayed Restart");
269 cpuUsageMetric = metricFactory.CreateGauge("tgs_game_cpu_usage", "Estimated total CPU usage time for the game process from 0-1");
270 ramUsageMetric = metricFactory.CreateGauge("tgs_game_ram_usage", "Total used bytes of private memory for the game process");
271
272 chat.RegisterCommandHandler(this);
273
275 releaseServers = false;
276 activeParametersUpdated = new TaskCompletionSource();
277
278 restartRegistration = serverControl.RegisterForRestart(this);
279 try
280 {
283 }
284 catch
285 {
286 restartRegistration.Dispose();
287 synchronizationSemaphore?.Dispose();
288 throw;
289 }
290
291 Logger.LogTrace("Created watchdog");
292 }
293
296 {
297 Logger.LogTrace("Disposing...");
298 synchronizationSemaphore.Dispose();
299 restartRegistration.Dispose();
300
303 monitorCts?.Dispose();
304
305 disposed = true;
306 }
307
310 {
311 using (await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken))
312 {
316 if (!currentEngine.HasValue)
317 return false;
318
319 bool match = launchParameters.CanApplyWithoutReboot(currentLaunchParameters, currentEngine.Value);
320 if (match || Status == WatchdogStatus.Offline || Status == WatchdogStatus.DelayedRestart)
321 return false;
322
323 var oldTcs = Interlocked.Exchange(ref activeParametersUpdated, new TaskCompletionSource());
324 oldTcs.SetResult();
325 }
326
327 return true;
328 }
329
332 {
333 using (await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken))
334 {
338 if (Status != WatchdogStatus.Online || activeServer == null)
339 return new MessageContent
340 {
341 Text = "TGS: Server offline!",
342 };
343
344 var commandResult = await activeServer.SendCommand(command, cancellationToken);
345
346 if (commandResult == null)
347 return new MessageContent
348 {
349 Text = "TGS: Bad topic exchange!",
350 };
351
352 if (commandResult == null)
353 return new MessageContent
354 {
355 Text = "TGS: Bad topic response!",
356 };
357
359 {
360 Text = commandResult.CommandResponse?.Text ?? commandResult.CommandResponseMessage,
361 Embed = commandResult.CommandResponse?.Embed,
362 };
363
364 if (commandResponse.Text == null && commandResponse.Embed == null)
365 {
366 commandResponse.Text = "TGS: Command processed but no DMAPI response returned!";
367 }
368
370
371 return commandResponse;
372 }
373 }
374
/// <summary>
/// Start the watchdog.
/// </summary>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> used both to wait for the lock and for the launch itself.</param>
/// <returns>A <see cref="ValueTask"/> representing the running operation.</returns>
/// <exception cref="JobException">Thrown with <see cref="ErrorCode.WatchdogRunning"/> when <see cref="Status"/> is not <see cref="WatchdogStatus.Offline"/>.</exception>
public async ValueTask Launch(CancellationToken cancellationToken)
{
    // Reject the request up front, before queueing on the semaphore.
    var alreadyRunning = Status != WatchdogStatus.Offline;
    if (alreadyRunning)
    {
        throw new JobException(ErrorCode.WatchdogRunning);
    }

    using (await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken))
    {
        // Fresh launch: start the monitor, announce the launch and any failure, nothing to reattach to.
        await LaunchNoLock(
            startMonitor: true,
            announce: true,
            announceFailure: true,
            reattachInfo: null,
            cancellationToken: cancellationToken);
    }
}
383
385 public virtual async ValueTask ResetRebootState(CancellationToken cancellationToken)
386 {
387 using (await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken))
388 {
389 if (Status == WatchdogStatus.Offline)
390 return;
392 if (toClear != null)
393 await toClear.SetRebootState(Session.RebootState.Normal, cancellationToken);
394 }
395 }
396
398 public async ValueTask Restart(bool graceful, CancellationToken cancellationToken)
399 {
400 if (Status == WatchdogStatus.Offline)
401 throw new JobException(ErrorCode.WatchdogNotRunning);
402
403 Logger.LogTrace("Begin Restart. Graceful: {gracefulFlag}", graceful);
404 using (await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken))
405 {
406 if (!graceful)
407 {
408 Chat.QueueWatchdogMessage("Manual restart triggered...");
409 await TerminateNoLock(false, false, cancellationToken);
410 await LaunchNoLock(true, false, true, null, cancellationToken);
411 return;
412 }
413
415 if (toReboot != null
416 && !await toReboot.SetRebootState(Session.RebootState.Restart, cancellationToken))
417 Logger.LogWarning("Unable to send reboot state change event!");
418 }
419 }
420
422 public async Task StartAsync(CancellationToken cancellationToken)
423 {
424 var reattachInfo = await SessionPersistor.Load(cancellationToken);
425 var reattaching = reattachInfo != null;
426 if (!autoStart && !reattaching)
427 return;
428
429 var job = Models.Job.Create(
431 ? JobCode.StartupWatchdogReattach
432 : JobCode.StartupWatchdogLaunch,
433 null,
434 metadata,
435 DreamDaemonRights.Shutdown);
437 job,
438 async (core, databaseContextFactory, paramJob, progressFunction, ct) =>
439 {
440 if (core?.Watchdog != this)
442
444 await LaunchNoLock(true, true, true, reattachInfo, ct);
445
447 },
448 cancellationToken);
449 }
450
/// <summary>
/// Stops the watchdog by performing a non-graceful termination.
/// </summary>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> for the operation.</param>
/// <returns>A <see cref="Task"/> representing the running operation.</returns>
public async Task StopAsync(CancellationToken cancellationToken)
{
    // The shutdown is only announced when the servers are not being released (detached).
    // NOTE: unlike Terminate, this path does not take synchronizationSemaphore.
    var announce = !releaseServers;
    await TerminateNoLock(graceful: false, announce: announce, cancellationToken: cancellationToken);
}
454
/// <summary>
/// Stops the watchdog.
/// </summary>
/// <param name="graceful">Forwarded to <c>TerminateNoLock</c>; when <see langword="true"/> only the reboot state of the active controller is changed instead of shutting down immediately.</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> for the operation.</param>
/// <returns>A <see cref="ValueTask"/> representing the running operation.</returns>
public async ValueTask Terminate(bool graceful, CancellationToken cancellationToken)
{
    using (await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken))
    {
        // Read releaseServers only once the lock is held; announce only when not detaching.
        var announce = !releaseServers;
        await TerminateNoLock(graceful, announce, cancellationToken);
    }
}
461
464 {
466 {
467 await Terminate(true, cancellationToken);
468
469 if (Status != WatchdogStatus.Offline)
470 {
471 Logger.LogDebug("Waiting for server to gracefully shut down.");
472 await monitorTask!.WaitAsync(cancellationToken);
473 }
474 else
475 Logger.LogTrace("Graceful shutdown requested but server is already offline.");
476
477 return;
478 }
479
480 releaseServers = true;
481 if (Status == WatchdogStatus.Online)
482 Chat.QueueWatchdogMessage("Detaching...");
483 else
484 Logger.LogTrace("Not sending detach chat message as status is: {status}", Status);
485 }
486
488 public abstract ValueTask InstanceRenamed(string newInstanceName, CancellationToken cancellationToken);
489
/// <summary>
/// Attempt to create a process dump for the game server while holding the watchdog lock.
/// </summary>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> for the operation.</param>
/// <returns>A <see cref="ValueTask"/> representing the running operation.</returns>
public async ValueTask CreateDump(CancellationToken cancellationToken)
{
    // The lock is held until the method exits, i.e. for the whole dump operation.
    using var heldLock = await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken);
    await CreateDumpNoLock(cancellationToken);
}
496
498 public async ValueTask<bool> Broadcast(string message, CancellationToken cancellationToken)
499 {
500 ArgumentNullException.ThrowIfNull(message);
501
503 if (activeServer == null)
504 {
505 Logger.LogInformation("Attempted broadcast failed, no active server!");
506 return false;
507 }
508
509 if (!activeServer.DMApiAvailable)
510 {
511 Logger.LogInformation("Attempted broadcast failed, no DMAPI!");
512 return false;
513 }
514
515 var minimumRequiredVersion = new Version(5, 7, 0);
516 if (activeServer.DMApiVersion < minimumRequiredVersion)
517 {
518 Logger.LogInformation(
519 "Attempted broadcast failed, insufficient interop version: {interopVersion}. Requires {minimumRequiredVersion}!",
520 activeServer.DMApiVersion,
522 return false;
523 }
524
525 Logger.LogInformation("Broadcasting: {message}", message);
526
527 var response = await activeServer.SendCommand(
529 cancellationToken);
530
531 return response != null && response.ErrorMessage == null;
532 }
533
535 public void RunMetricsScrape()
536 {
540 cpuUsageMetric.Set(controller?.MeasureProcessorTimeDelta() ?? 0);
541 }
542
545 {
547
549 {
550 Logger.LogDebug("Not sending sensitive event parameters");
551 parameters = Enumerable.Empty<string>();
552 }
553
554 // Method explicitly implemented to prevent accidental calls when this.eventConsumer should be used.
556
557 // Server may have ended
558 if (activeServer == null)
559 return;
560
562 var result = await activeServer.SendCommand(
564 cancellationToken);
565
567 }
568
571 => throw new NotSupportedException("Watchdogs do not support custom events!");
572
581
592 bool startMonitor,
593 bool announce,
594 bool announceFailure,
596 CancellationToken cancellationToken)
597 {
598 Logger.LogTrace("Begin LaunchImplNoLock");
599 if (startMonitor && Status != WatchdogStatus.Offline)
600 throw new JobException(ErrorCode.WatchdogRunning);
601
602 if (reattachInfo == null && !DmbFactory.DmbAvailable)
603 throw new JobException(ErrorCode.WatchdogCompileJobCorrupted);
604
605 // this is necessary, the monitor could be in its sleep loop trying to restart, if so cancel THAT monitor and start our own with blackjack and hookers
606 var eventTask = ValueTask.CompletedTask;
607 if (announce)
608 {
610 reattachInfo == null
611 ? "Launching..."
612 : "Reattaching..."); // simple announce
613 if (reattachInfo == null)
614 eventTask = HandleEventImpl(EventType.WatchdogLaunch, Enumerable.Empty<string>(), false, cancellationToken);
615 }
616
617 // since neither server is running, this is safe to do
620 ClientCount = null;
621
622 try
623 {
624 await InitController(eventTask, reattachInfo, cancellationToken);
625 }
627 {
628 Logger.LogTrace(ex, "Controller initialization cancelled!");
629 throw;
630 }
631 catch (Exception e)
632 {
633 Logger.LogWarning(e, "Failed to start watchdog!");
636 {
638 if (announceFailure)
639 Chat.QueueWatchdogMessage("Startup failed!");
640 }
641
643 throw;
644 }
645 finally
646 {
647 // finish the chat task that's in flight
648 try
649 {
651 }
653 {
654 Logger.LogTrace(ex, "Announcement task canceled!");
655 }
656 }
657
658 Logger.LogInformation("Controller(s) initialized successfully");
659
660 if (startMonitor)
661 {
664 }
665 }
666
672 {
673 Logger.LogTrace("StopMonitor");
674 if (monitorTask == null)
675 return false;
676 var wasRunning = !monitorTask.IsCompleted;
677 monitorCts!.Cancel();
679 Logger.LogTrace("Stopped Monitor");
680 monitorCts.Dispose();
681 monitorTask = null;
682 monitorCts = null;
683 return wasRunning;
684 }
685
694 {
695 var launchResult = await controller.LaunchResult.WaitAsync(cancellationToken);
696
697 // Dead sessions won't trigger this
698 if (launchResult.ExitCode.HasValue) // you killed us ray...
699 throw new JobException(
700 ErrorCode.WatchdogStartupFailed,
701 new JobException($"{serverName} failed to start: {launchResult}"));
702 if (!launchResult.StartupTime.HasValue)
703 throw new JobException(
704 ErrorCode.WatchdogStartupTimeout,
705 new JobException($"{serverName} timed out on startup: {ActiveLaunchParameters.StartupTimeout!.Value}s"));
706 }
707
714 {
715 // we lost the server, just restart entirely
716 // DCT: Operation must always run
718 ClientCount = null;
719 const string FailReattachMessage = "Unable to properly reattach to server! Restarting watchdog...";
720 Logger.LogWarning(FailReattachMessage);
721
723 await InitController(ValueTask.CompletedTask, null, cancellationToken);
724 }
725
731
738 {
739 Logger.LogTrace("DisposeAndNullControllers");
741 {
743 if (!releaseServers)
744 await SessionPersistor.Clear(cancellationToken);
745 }
746 }
747
753
762 CancellationToken cancellationToken);
763
770 protected async ValueTask BeforeApplyDmb(Models.CompileJob newCompileJob, CancellationToken cancellationToken)
771 {
773 {
774 Logger.LogTrace("Same compile job, not sending deployment event");
775 return;
776 }
777
779 metadata,
781
783 EventType.DeploymentActivation,
784 new List<string?>
785 {
786 GameIOManager.ResolvePath(newCompileJob.DirectoryName!.Value.ToString()),
787 },
788 false,
789 false,
790 cancellationToken);
791
792 try
793 {
794 await remoteDeploymentManager.ApplyDeployment(newCompileJob, cancellationToken);
795 }
796 catch (Exception ex)
797 {
798 Logger.LogWarning(ex, "Failed to apply remote deployment!");
799 }
800
802 }
803
813 {
814 try
815 {
816 var sessionEventTask = relayToSession ? ((IEventConsumer)this).HandleEvent(eventType, parameters, false, false, cancellationToken) : ValueTask.CompletedTask;
817 var eventConsumerTask = eventConsumer.HandleEvent(eventType, parameters, false, false, cancellationToken);
821 }
822 catch (JobException ex)
823 {
824 Logger.LogError(ex, "Suppressing exception triggered by event!");
825 }
826 }
827
834 {
835 Logger.LogTrace("Monitor restart!");
836
837 await DisposeAndNullControllers(cancellationToken);
838
839 for (var retryAttempts = 1; ; ++retryAttempts)
840 {
841 Status = WatchdogStatus.Restoring;
843 using (await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken))
844 try
845 {
846 // use LaunchImplNoLock without announcements or restarting the monitor
847 await LaunchNoLock(false, false, false, null, cancellationToken);
848 Status = WatchdogStatus.Online;
849 Logger.LogDebug("Relaunch successful, resuming monitor...");
850 return;
851 }
853 {
855 }
856
857 Logger.LogWarning(launchException, "Failed to automatically restart the watchdog! Attempt: {attemptNumber}", retryAttempts);
858 Status = WatchdogStatus.DelayedRestart;
859
860 var retryDelay = Math.Min(
861 Convert.ToInt32(
862 Math.Pow(2, retryAttempts)),
863 TimeSpan.FromHours(1).TotalSeconds); // max of one hour, increasing by a power of 2 each time
864
866 $"Failed to restart (Attempt: {retryAttempts}), retrying in {retryDelay}s...");
867
869 TimeSpan.FromSeconds(retryDelay),
870 cancellationToken);
871 }
872 }
873
880 {
882
885 {
886 Logger.LogDebug("Found new CompileJob without waiting");
887 return;
888 }
889
891 }
892
898#pragma warning disable CA1502
900 {
901 Logger.LogTrace("Entered MonitorLifetimes");
902 Status = WatchdogStatus.Online;
903 using var cancellationTokenLoggingRegistration = cancellationToken.Register(() => Logger.LogTrace("Monitor cancellationToken triggered"));
904
905 // this function is responsible for calling HandleMonitorWakeup when necessary and maintaining the MonitorState
906 try
907 {
909 Task? activeServerLifetime = null,
910 activeServerReboot = null,
911 activeServerStartup = null,
912 serverPrimed = null,
914 newDmbAvailable = null,
915 healthCheck = null;
917 var ranInitialDmbCheck = false;
918 for (ulong iteration = 1; nextAction != MonitorAction.Exit; ++iteration)
920 {
921 var nextMonitorWakeupTcs = new TaskCompletionSource();
922 try
923 {
924 Logger.LogTrace("Iteration {iteration} of monitor loop", iteration);
925 nextAction = MonitorAction.Continue;
926
928
930 {
933 {
934 if (sameController && oldTask?.IsCompleted == true)
935 return;
936
938 }
939
940 controller!.RebootGate = nextMonitorWakeupTcs.Task;
941
946
947 if (!sameController)
949
953 () =>
954 {
958 ranInitialDmbCheck = true;
959 return result;
960 });
961 }
962
963 if (controller != null)
964 {
966
971 : Task.Delay(
972 TimeSpan.FromSeconds(healthCheckSeconds),
973 cancellationToken);
974
975 // cancel waiting if requested
976 var toWaitOn = Task.WhenAny(
983 serverPrimed!);
984
985 // wait for something to happen
986 await toWaitOn.WaitAsync(cancellationToken);
987 }
988 else
989 {
990 Logger.LogError("Controller was null on monitor wakeup! Attempting restart...");
991 nextAction = MonitorAction.Restart; // excuse me wtf?
992 }
993
994 cancellationToken.ThrowIfCancellationRequested();
995 Logger.LogTrace("Monitor activated");
996
997 // always run HandleMonitorWakeup from the context of the semaphore lock
998 using (await SemaphoreSlimContext.Lock(synchronizationSemaphore, cancellationToken))
999 {
1000 // Set this sooner so chat sends don't hold us up
1002 Status = WatchdogStatus.Restoring;
1003
1004 // multiple things may have happened, handle them one at a time
1006 {
1007 MonitorActivationReason activationReason = default; // this will always be assigned before being used
1008
1010 {
1011 var taskCompleted = task?.IsCompleted == true;
1012 task = null;
1013 if (nextAction == MonitorAction.Skip)
1014 nextAction = MonitorAction.Continue;
1015 else if (taskCompleted)
1016 {
1018 return true;
1019 }
1020
1021 return false;
1022 }
1023
1024 // process the tasks in this order and call HandleMonitorWakeup for each depending on the new monitorState
1032
1034
1035 if (!anyActivation)
1037 else
1038 {
1039 Logger.LogTrace("Reason: {activationReason}", activationReason);
1040 if (activationReason == MonitorActivationReason.HealthCheck)
1042 cancellationToken);
1043 else
1046 cancellationToken);
1047 }
1048 }
1049 }
1050
1051 Logger.LogTrace("Next monitor action is to {nextAction}", nextAction);
1052
1053 // Restart if requested
1054 if (nextAction == MonitorAction.Restart)
1055 {
1056 await MonitorRestart(cancellationToken);
1057 nextAction = MonitorAction.Continue;
1058 }
1059 }
1061 {
1062 // really, this should NEVER happen
1063 Logger.LogError(
1064 e,
1065 "Monitor crashed! Iteration: {iteration}",
1066 iteration);
1067
1069 ? "Recovering"
1070 : "Shutting down";
1072 $"Monitor crashed, this should NEVER happen! Please report this, full details in logs! {nextActionMessage}. Error: {e.Message}");
1073
1074 if (disposed)
1076 else if (nextAction != MonitorAction.Exit)
1077 {
1078 if (GetActiveController()?.Lifetime.IsCompleted != true)
1079 await MonitorRestart(cancellationToken);
1080 else
1081 Logger.LogDebug("Server seems to be okay, not restarting");
1082 nextAction = MonitorAction.Continue;
1083 }
1084 }
1085 finally
1086 {
1087 nextMonitorWakeupTcs.SetResult();
1088 }
1089 }
1090 }
1092 {
1093 // stop signal
1094 Logger.LogDebug("Monitor cancelled");
1095
1096 if (releaseServers)
1097 {
1098 Logger.LogTrace("Detaching server...");
1100 if (controller != null)
1101 await controller.Release();
1102 else
1103 Logger.LogError("Controller was null on monitor shutdown!");
1104 }
1105 }
1106
1107 // DCT: Operation must always run
1109 Status = WatchdogStatus.Offline;
1110
1111 Logger.LogTrace("Monitor exiting...");
1112 }
1113#pragma warning restore CA1502
1114
1123 {
1124 if (Status == WatchdogStatus.Offline)
1125 return;
1126 if (!graceful)
1127 {
1130 ? EventType.WatchdogDetach
1131 : EventType.WatchdogShutdown,
1132 Enumerable.Empty<string>(),
1134 cancellationToken);
1135
1136 if (announce)
1137 Chat.QueueWatchdogMessage("Shutting down...");
1138
1140
1142
1143 LastLaunchParameters = null;
1144 return;
1145 }
1146
1147 // merely set the reboot state
1149 if (toKill != null)
1150 {
1151 await toKill.SetRebootState(Session.RebootState.Shutdown, cancellationToken);
1152 Logger.LogTrace("Graceful termination requested");
1153 }
1154 else
1155 Logger.LogTrace("Could not gracefully terminate as there is no active controller!");
1156 }
1157
1164 {
1165 Logger.LogTrace("Sending health check to active server...");
1167 if (activeServer == null)
1168 return MonitorAction.Restart; // uhhhh???
1169
1170 var response = await activeServer.SendCommand(new TopicParameters(), cancellationToken);
1171
1172 var shouldShutdown = activeServer.RebootState == Session.RebootState.Shutdown;
1173 if (response == null)
1174 {
1175 switch (++healthChecksMissed)
1176 {
1177 case 1:
1178 Logger.LogDebug("DEFCON 4: Game server missed first health check!");
1179 break;
1180 case 2:
1181 const string message2 = "DEFCON 3: Game server has missed 2 health checks!";
1182 Logger.LogInformation(message2);
1184 break;
1185 case 3:
1187 ? "shutdown"
1188 : "be restarted";
1189 const string logTemplate1 = "DEFCON 2: Game server has missed 3 health checks! If it does not respond to the next one, the watchdog will {actionToTake}!";
1190 Logger.LogWarning(logTemplate1, actionToTake);
1192 logTemplate1.Replace(
1193 "{actionToTake}",
1195 StringComparison.Ordinal));
1196 break;
1197 case 4:
1199 ? "Shutting down due to graceful termination request"
1200 : "Restarting";
1201 const string logTemplate2 = "DEFCON 1: Four health checks have been missed! {actionTaken}...";
1202 Logger.LogWarning(logTemplate2, actionTaken);
1204 logTemplate2.Replace(
1205 "{actionTaken}",
1207 StringComparison.Ordinal));
1208
1210 {
1211 Logger.LogDebug("DumpOnHealthCheckRestart enabled.");
1212 try
1213 {
1214 await CreateDumpNoLock(cancellationToken);
1215 }
1216 catch (JobException ex)
1217 {
1218 Logger.LogWarning(ex, "Creating dump failed!");
1219 }
1220 catch (Win32Exception ex)
1221 {
1222 Logger.LogWarning(ex, "Creating dump failed!");
1223 }
1224 }
1225 else
1226 Logger.LogTrace("DumpOnHealthCheckRestart disabled.");
1227
1228 await DisposeAndNullControllers(cancellationToken);
1230 default:
1231 Logger.LogError("Invalid health checks missed count: {healthChecksMissed}", healthChecksMissed);
1232 break;
1233 }
1234 }
1235 else
1236 {
1238 ClientCount = response.ClientCount;
1239 }
1240
1241 return MonitorAction.Continue;
1242 }
1243
1249 {
1250 if (result?.ChatResponses != null)
1251 {
1253 foreach (var response in result.ChatResponses
1254 .Where(response =>
1255 {
1256 if (response.ChannelIds == null)
1257 {
1258 if (!warnedMissingChannelIds)
1259 {
1260 Logger.LogWarning("DMAPI response contains null channelIds!");
1261 warnedMissingChannelIds = true;
1262 }
1263
1264 return false;
1265 }
1266
1267 return true;
1268 }))
1270 response,
1271 response.ChannelIds!
1272 .Select(channelIdString =>
1273 {
1274 if (UInt64.TryParse(channelIdString, out var channelId))
1275 return (ulong?)channelId;
1276 else
1277 Logger.LogWarning("Could not parse chat response channel ID: {channelID}", channelIdString);
1278
1279 return null;
1280 })
1282 .Select(nullableChannelId => nullableChannelId!.Value));
1283 }
1284 }
1285
1292 {
1293 const string DumpDirectory = "ProcessDumps";
1294
1295 var session = GetActiveController();
1296 if (session?.Lifetime.IsCompleted != false)
1297 throw new JobException(ErrorCode.GameServerOffline);
1298
1299 var dumpFileExtension = session.DumpFileExtension;
1300
1301 var dumpFileNameTemplate = diagnosticsIOManager.ResolvePath(
1302 diagnosticsIOManager.ConcatPath(
1304 $"DreamDaemon-{DateTimeOffset.UtcNow.ToFileStamp()}"));
1305
1306 var dumpFileName = $"{dumpFileNameTemplate}{dumpFileExtension}";
1307 var iteration = 0;
1308 while (await diagnosticsIOManager.FileExists(dumpFileName, cancellationToken))
1309 dumpFileName = $"{dumpFileNameTemplate} ({++iteration}){dumpFileExtension}";
1310
1311 if (iteration == 0)
1312 await diagnosticsIOManager.CreateDirectory(DumpDirectory, cancellationToken);
1313
1314 if (session.Lifetime.IsCompleted)
1315 throw new JobException(ErrorCode.GameServerOffline);
1316
1317 Logger.LogInformation("Dumping session to {dumpFileName}...", dumpFileName);
1318 await session.CreateDump(dumpFileName, ActiveLaunchParameters.Minidumps!.Value, cancellationToken);
1319 }
1320 }
1321}
virtual ? long Id
The ID of the entity.
Definition EntityId.cs:14
Metadata about a server instance.
Definition Instance.cs:9
uint? HealthCheckSeconds
The number of seconds between each watchdog health check. 0 disables.
bool? DumpOnHealthCheckRestart
If a process core dump should be created prior to restarting the watchdog due to health check failure...
Extension methods for the ValueTask and ValueTask<TResult> classes.
static async ValueTask WhenAll(IEnumerable< ValueTask > tasks)
Fully await a given list of tasks.
Represents a tgs_chat_user datum.
Definition ChatUser.cs:12
bool DmbAvailable
If LockNextDmb will succeed.
Definition DmbFactory.cs:41
Task OnNewerDmb
Get a Task that completes when the result of a call to LockNextDmb will be different than the previou...
Definition DmbFactory.cs:31
async ValueTask< CompileJob?> LatestCompileJob()
Gets the latest CompileJob. A ValueTask<TResult> resulting in the latest CompileJob or null if none ar...
const string DifferentCoreExceptionMessage
Message for the InvalidOperationException if ever a job starts on a different IInstanceCore than the ...
Definition Instance.cs:36
Represents a message to send to a chat provider.
Represents a chat command to be handled by DD.
Data structure for TopicCommandType.EventNotification requests.
static TopicParameters CreateBroadcastParameters(string broadcastMessage)
Initializes a new instance of the TopicParameters class.
Parameters necessary for duplicating a ISessionController session.
async ValueTask< ReattachInformation?> Load(CancellationToken cancellationToken)
Load a saved ReattachInformation. A ValueTask<TResult> resulting in the stored ReattachInformation if ...
ValueTask Clear(CancellationToken cancellationToken)
Clear any stored ReattachInformation. A ValueTask representing the running operation.
async ValueTask HandleRestart(Version? updateVersion, bool handlerMayDelayShutdownWithExtremelyLongRunningTasks, CancellationToken cancellationToken)
Handle a restart of the server. A ValueTask representing the running operation.
async ValueTask< bool > Broadcast(string message, CancellationToken cancellationToken)
Send a broadcast message to the DMAPI. A ValueTask<TResult> resulting in true if the broadcast succee...
async ValueTask< MonitorAction > HandleHealthCheck(CancellationToken cancellationToken)
Handles a watchdog health check.
ISessionController? GetActiveController()
Get the active ISessionController.
readonly Gauge cpuUsageMetric
Active session CPU usage as a metric.
readonly IJobManager jobManager
The IJobManager for the WatchdogBase.
ILogger< WatchdogBase > Logger
The ILogger for the WatchdogBase.
bool releaseServers
If the servers should be released instead of shutdown.
async ValueTask ReattachFailure(CancellationToken cancellationToken)
Call from InitController(ValueTask, ReattachInformation, CancellationToken) when a reattach operation...
async ValueTask CreateDumpNoLock(CancellationToken cancellationToken)
Attempt to create a process dump for the game server. Requires a lock on synchronizationSemaphore.
Models.CompileJob? ActiveCompileJob
Retrieves the Models.CompileJob currently running on the server.
DreamDaemonLaunchParameters? LastLaunchParameters
The DreamDaemonLaunchParameters the active server is using. This may not be the exact same as ActiveLa...
DateTimeOffset? LaunchTime
When the current server execution was started.
WatchdogBase(IChatManager chat, ISessionControllerFactory sessionControllerFactory, IDmbFactory dmbFactory, ISessionPersistor sessionPersistor, IJobManager jobManager, IServerControl serverControl, IAsyncDelayer asyncDelayer, IIOManager diagnosticsIOManager, IEventConsumer eventConsumer, IRemoteDeploymentManagerFactory remoteDeploymentManagerFactory, IMetricFactory metricFactory, IIOManager gameIOManager, ILogger< WatchdogBase > logger, DreamDaemonLaunchParameters initialLaunchParameters, Api.Models.Instance metadata, bool autoStart)
Initializes a new instance of the WatchdogBase class.
readonly bool autoStart
If the WatchdogBase should LaunchNoLock(bool, bool, bool, ReattachInformation, CancellationToken) in ...
async ValueTask Terminate(bool graceful, CancellationToken cancellationToken)
Stops the watchdog. A ValueTask representing the running operation.
async ValueTask CheckLaunchResult(ISessionController controller, string serverName, CancellationToken cancellationToken)
Check the LaunchResult of a given controller for errors and throw a JobException if any are detected...
readonly Api.Models.Instance metadata
The Api.Models.Instance for the WatchdogBase.
bool AlphaIsActive
If the alpha server is the active server.
readonly IEventConsumer eventConsumer
The IEventConsumer that is not the WatchdogBase.
bool disposed
If the WatchdogBase has been DisposeAsync'd.
long? WorldIteration
An incrementing ID representing the current iteration of the server's world (i.e. after calling /world/proc...
readonly SemaphoreSlim controllerDisposeSemaphore
SemaphoreSlim used for DisposeAndNullControllers.
WatchdogStatus Status
The current WatchdogStatus.
ValueTask< MonitorAction > HandleMonitorWakeup(MonitorActivationReason activationReason, CancellationToken cancellationToken)
Handles the actions to take when the monitor has to "wake up".
CancellationTokenSource? monitorCts
The CancellationTokenSource for the monitor loop.
Task? monitorTask
The Task running the monitor loop.
void RunMetricsScrape()
Callback to update transient metrics.
readonly IRemoteDeploymentManagerFactory remoteDeploymentManagerFactory
The IRemoteDeploymentManagerFactory for the WatchdogBase.
readonly SemaphoreSlim synchronizationSemaphore
The SemaphoreSlim for the WatchdogBase.
async ValueTask Restart(bool graceful, CancellationToken cancellationToken)
Restarts the watchdog. A ValueTask representing the running operation.
long? MemoryUsage
Gets the memory usage of the game server in bytes.
readonly IIOManager diagnosticsIOManager
The IIOManager pointing to the Diagnostics directory.
async ValueTask< MessageContent > HandleChatCommand(string commandName, string arguments, ChatUser sender, CancellationToken cancellationToken)
Handle a chat command. A ValueTask<TResult> resulting in the MessageContent text to send back.
ValueTask InitController(ValueTask eventTask, ReattachInformation? reattachInfo, CancellationToken cancellationToken)
Starts all ISessionControllers.
async ValueTask CreateDump(CancellationToken cancellationToken)
Attempt to create a process dump for DreamDaemon. A ValueTask representing the running operation.
IIOManager GameIOManager
The IIOManager for the WatchdogBase pointing to the Game directory.
DreamDaemonLaunchParameters ActiveLaunchParameters
The DreamDaemonLaunchParameters to be applied.
uint? ClientCount
Last known client count queried from the DMAPI. Requires health checks to be enabled to populate.
async ValueTask BeforeApplyDmb(Models.CompileJob newCompileJob, CancellationToken cancellationToken)
To be called before a given newCompileJob goes live.
async ValueTask< bool > StopMonitor()
Stops MonitorLifetimes(CancellationToken). Doesn't kill the servers.
async ValueTask MonitorRestart(CancellationToken cancellationToken)
Attempt to restart the monitor from scratch.
async ValueTask Launch(CancellationToken cancellationToken)
Start the IWatchdog. A ValueTask representing the running operation.
async Task InitialCheckDmbUpdated(CompileJob currentCompileJob)
Check for a new IDmbProvider.
async ValueTask LaunchNoLock(bool startMonitor, bool announce, bool announceFailure, ReattachInformation? reattachInfo, CancellationToken cancellationToken)
Launches the watchdog.
async ValueTask DisposeAndNullControllers(CancellationToken cancellationToken)
Wrapper for DisposeAndNullControllersImpl under a locked context.
ValueTask DisposeAndNullControllersImpl()
Call IDisposable.Dispose and null the fields for all ISessionControllers.
async ValueTask TerminateNoLock(bool graceful, bool announce, CancellationToken cancellationToken)
Implementation of Terminate(bool, CancellationToken). Does not lock synchronizationSemaphore.
async ValueTask< bool > ChangeSettings(DreamDaemonLaunchParameters launchParameters, CancellationToken cancellationToken)
Changes the ActiveLaunchParameters. If currently running, may trigger a graceful restart....
void HandleChatResponses(TopicResponse? result)
Handle any TopicResponse.ChatResponses in a given topic result.
virtual async ValueTask ResetRebootState(CancellationToken cancellationToken)
Cancels pending graceful actions. A ValueTask representing the running operation.
async Task MonitorLifetimes(CancellationToken cancellationToken)
The main loop of the watchdog. Asynchronously waits for events to occur and then responds to them.
long? SessionId
An incrementing ID for representing current server execution.
async Task StartAsync(CancellationToken cancellationToken)
ValueTask InstanceRenamed(string newInstanceName, CancellationToken cancellationToken)
Called when the owning Instance is renamed. A ValueTask representing the running operation.
readonly Gauge ramUsageMetric
MemoryUsage as a metric.
int healthChecksMissed
The number of heartbeats missed.
readonly Gauge watchdogStatusMetric
The Status as a metric.
IChatManager Chat
The IChatManager for the WatchdogBase.
async Task StopAsync(CancellationToken cancellationToken)
readonly IRestartRegistration restartRegistration
The IRestartRegistration for the WatchdogBase.
async ValueTask HandleEventImpl(EventType eventType, IEnumerable< string > parameters, bool relayToSession, CancellationToken cancellationToken)
Handle a given eventType without re-throwing errors.
WatchdogStatus status
Backing field for Status.
volatile TaskCompletionSource activeParametersUpdated
TaskCompletionSource that completes when ActiveLaunchParameters are changed and we are running.
Operation exceptions thrown from the context of a Models.Job.
async ValueTask Delay(TimeSpan timeSpan, CancellationToken cancellationToken)
Create a Task that completes after a given timeSpan. A ValueTask representing the running operation.
static async ValueTask< SemaphoreSlimContext > Lock(SemaphoreSlim semaphore, CancellationToken cancellationToken, ILogger? logger=null)
Asynchronously locks a semaphore.
Helpers for manipulating the Serilog.Context.LogContext.
const string WatchdogMonitorIterationContextProperty
The Serilog.Context.LogContext property name for the ID of the watchdog monitor iteration currently b...
For managing connected chat services.
void QueueWatchdogMessage(string message)
Queue a chat message to configured watchdog channels.
void RegisterCommandHandler(ICustomCommandHandler customCommandHandler)
Registers a customCommandHandler to use.
ValueTask UpdateTrackingContexts(CancellationToken cancellationToken)
Force an update with the active channels on all active IChatTrackingContexts.
void QueueMessage(MessageContent message, IEnumerable< ulong > channelIds)
Queue a chat message to a given set of channelIds.
Handles Commands.ICommands that map to those defined in an IChatTrackingContext.
IRemoteDeploymentManager CreateRemoteDeploymentManager(Api.Models.Instance metadata, RemoteGitProvider remoteGitProvider)
Creates an IRemoteDeploymentManager for a given remoteGitProvider.
Consumes EventTypes and takes the appropriate actions.
ValueTask? HandleCustomEvent(string eventName, IEnumerable< string?> parameters, CancellationToken cancellationToken)
Handles a given custom event.
ValueTask HandleEvent(EventType eventType, IEnumerable< string?> parameters, bool sensitiveParameters, bool deploymentPipeline, CancellationToken cancellationToken)
Handle a given eventType.
Handles communication with a DreamDaemon IProcess.
Models.CompileJob CompileJob
Gets the CompileJob associated with the ISessionController.
ReattachInformation ReattachInformation
Gets the Session.ReattachInformation associated with the ISessionController.
EngineVersion EngineVersion
Gets the Api.Models.EngineVersion associated with the ISessionController.
long? StartupBridgeRequestsReceived
The number of times a startup bridge request has been received. null if DMApiAvailable is false.
Handles saving and loading ReattachInformation.
Runs and monitors the twin server controllers.
Definition IWatchdog.cs:16
Represents the lifetime of an IRestartHandler registration.
Represents a service that may take an updated Host assembly and run it, stopping the current assembly...
IRestartRegistration RegisterForRestart(IRestartHandler handler)
Register a given handler to run before stopping the server for a restart.
Interface for using filesystems.
Definition IIOManager.cs:14
Manages the runtime of Jobs.
ValueTask RegisterOperation(Job job, JobEntrypoint operation, CancellationToken cancellationToken)
Registers a given Job and begins running it.
long? MemoryUsage
Gets the process' memory usage in bytes.
DateTimeOffset? LaunchTime
When the process was started.
ErrorCode
Types of Response.ErrorMessageResponses that the API may return.
Definition ErrorCode.cs:12
JobCode
The different types of Response.JobResponse.
Definition JobCode.cs:9
WatchdogStatus
The current status of the watchdog.
@ List
User may list files if the Models.Instance allows it.
DreamDaemonRights
Rights for managing DreamDaemon.
EventType
Types of events. Mirror in tgs.dm. Prefer last listed name for script.
Definition EventType.cs:7
RebootState
Represents the action to take when /world/Reboot() is called.
Definition RebootState.cs:7
MonitorAction
The action for the monitor loop to take when control is returned to it.
MonitorActivationReason
Reasons for the monitor to wake up.