installed: 3.3.4-13189-g.f88272d1e (28521) 138MB -
A user reported that the MAAS UI was hanging and all deployments were timing out. In the logs, I see rackd is reporting a critical error:
2023-08-15 23:03:05 provisioningserver.rpc.clusterservice: [critical] Failed to contact region. (While requesting RPC info at http://localhost:5240/MAAS).
Traceback (most recent call last):
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 661, in callback
self._startRunCallbacks(result)
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 763, in _startRunCallbacks
self._runCallbacks()
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1750, in gotResult
current_context.run(_inlineCallbacks, r, gen, status)
--- <exception caught here> ---
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1299, in _doUpdate
eventloops, maas_url = yield self._get_rpc_info(urls)
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1558, in _get_rpc_info
raise config_exc
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1529, in _get_rpc_info
eventloops, maas_url = yield self._parallel_fetch_rpc_info(urls)
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1503, in handle_responses
errors[0].raiseException()
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/python/failure.py", line 475, in raiseException
raise self.value.with_traceback(self.tb)
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1464, in _serial_fetch_rpc_info
raise last_exc
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1456, in _serial_fetch_rpc_info
response = yield self._fetch_rpc_info(url, orig_url)
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1656, in _inlineCallbacks
result = current_context.run(
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/python/failure.py", line 489, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1558, in _get_rpc_info
raise config_exc
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1529, in _get_rpc_info
eventloops, maas_url = yield self._parallel_fetch_rpc_info(urls)
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1503, in handle_responses
errors[0].raiseException()
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/python/failure.py", line 475, in raiseException
raise self.value.with_traceback(self.tb)
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1656, in _inlineCallbacks
result = current_context.run(
File "/snap/maas/28521/usr/lib/python3/dist-packages/twisted/python/failure.py", line 489, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1464, in _serial_fetch_rpc_info
raise last_exc
File "/snap/maas/28521/lib/python3.10/site-packages/provisioningserver/rpc/clusterservice.py", line 1456, in _serial_fetch_rpc_info
response = yield self._fetch_rpc_info(url, orig_url)
twisted.web._newclient.ResponseNeverReceived: [<twisted.python.failure.Failure twisted.internet.defer.CancelledError: >]
regionds are running:
root@maas:/var/snap/maas/common/log# ps -ef | grep regiond
root 82270 82197 1 Jun22 ? 16:41:35 python3 /snap/maas/28521/bin/regiond
root 82398 82270 29 Jun22 ? 16-08:57:38 python3 /snap/maas/28521/bin/regiond
root 82399 82270 0 Jun22 ? 06:39:56 python3 /snap/maas/28521/bin/regiond
root 82400 82270 0 Jun22 ? 13:07:29 python3 /snap/maas/28521/bin/regiond
root 82401 82270 0 Jun22 ? 06:04:17 python3 /snap/maas/28521/bin/regiond
root 3735835 3732995 0 23:04 pts/15 00:00:00 grep --color=auto regiond
But one of them is spinning on a CPU:
82398 root 20 0 1349024 406760 12472 S 110.9 0.6 23577:53 python3
strace on that spinning regiond shows that it is getting a bunch of -EAGAINs when reading a socket:
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 1
recvfrom(4, 0x55e14a3c58c0, 4096, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x55e146830bec, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=220350015}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=220577533}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bd0, FUTEX_WAKE_PRIVATE, 2147483647) = 1
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=222496459}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bec, FUTEX_WAKE_PRIVATE, 1) = 1
epoll_wait(3, [], 46, 0) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=223147519}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bf0, FUTEX_WAIT_PRIVATE, 2, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=223931288}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAKE_PRIVATE, 1) = 1
epoll_wait(3, [], 46, 5) = 0
futex(0x55e146830be8, FUTEX_WAKE_PRIVATE, 1) = 1
epoll_wait(3, [], 46, 0) = 0
futex(0x55e146830bec, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=231659458}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bec, FUTEX_WAKE_PRIVATE, 1) = 1
epoll_wait(3, [{events=EPOLLIN, data={u32=4, u64=94424856002564}}], 46, 499) = 1
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=296551151}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bd0, FUTEX_WAKE_PRIVATE, 2147483647) = 1
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=297757406}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bec, FUTEX_WAKE_PRIVATE, 1) = 1
recvfrom(4, "\0", 4096, 0, NULL, NULL) = 1
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=299812592}, FUTEX_BITSET_MATCH_ANY) = -1 ETIMEDOUT (Connection timed out)
futex(0x55e146830bec, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=305013714}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bec, FUTEX_WAKE_PRIVATE, 1) = 1
recvfrom(4, 0x55e14a3c58c0, 4096, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=309470966}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bec, FUTEX_WAKE_PRIVATE, 1) = 1
epoll_wait(3, [], 46, 0) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=312523457}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bec, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=313405840}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=314754416}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bd0, FUTEX_WAKE_PRIVATE, 2147483647) = 1
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=316072157}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bd0, FUTEX_WAKE_PRIVATE, 2147483647) = 1
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830be8, FUTEX_WAIT_BITSET_PRIVATE, 0, {tv_sec=8404744, tv_nsec=317530450}, FUTEX_BITSET_MATCH_ANY) = 0
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0x55e146830bec, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x55e146830bf0, FUTEX_WAKE_PRIVATE, 1) = 1
epoll_wait(3, ^Cstrace: Process 82398 detached
<detached ...>
fd 4 is a unix socket:
root@maas:/var/snap/maas/common/log# lsof -p 82398 | grep 4u
python3 82398 root 4u unix 0x0000000000000000 0t0 241252057 type=STREAM
who is on the other end? well, it looks like the same regiond is:
root@maas:/var/snap/maas/common/log# ss -xp | grep 241252057
u_str ESTAB 0 0 * 241252058 * 241252057 users:(("python3",pid=82398,fd=5))
u_str ESTAB 0 0 * 241252057 * 241252058 users:(("python3",pid=82398,fd=4))
... and that's about all I can discern quickly. I killed the spinning regiond at this point and things seemed to recover.
Hi @dan, do you have the full logs of the region and the rack? An sos report would be ideal btw. Did you use snap or deb?