From fab1f109d0564b44c950055d3233d6e7a1f3be81 Mon Sep 17 00:00:00 2001
From: onefang
Date: Thu, 23 Feb 2023 02:26:36 +1000
Subject: Add yet another timeout watchdog.
---
 README.md             |  1 +
 apt-panopticommon.lua | 35 ++++++++++++++++++++++++++++-------
 apt-panopticon.lua    | 14 +++++++-------
 laggers               |  3 +++
 update_apt-panopticon | 12 ++++++++++--
 5 files changed, 49 insertions(+), 16 deletions(-)
 create mode 100755 laggers
diff --git a/README.md b/README.md
index 2edc3ca..7e5d6d2 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,7 @@ installed -
 * lua-rrd
 * LuaSocket, on Debian based systems it'll be in the lua-socket package.
 * md5sum and sha256, on Debian based systems they'll be in the coreutils package.
+* timeout, on Debian based systems it'll be in the coreutils package.
 * rrdtool
 * xz, on Debian based systems it'll be in the xz-utils package.
 
diff --git a/apt-panopticommon.lua b/apt-panopticommon.lua
index da1fc3d..a2def5a 100644
--- a/apt-panopticommon.lua
+++ b/apt-panopticommon.lua
@@ -134,7 +134,7 @@ APT.parseArgs = function(args)
     local arg = {}
     local sendArgs = ""
     -- A special test to disable IPv6 tests if IPv6 isn't available.
-    if 1 == APT.exe('ip -6 addr | grep inet6 | grep " global"'):Do().status then
+    if 1 == APT.exe('ip -6 addr | grep inet6 | grep " global"'):timeout():Do().status then
 	table.insert(args, '--tests=-IPv6')
     end
     if 0 ~= #(args) then
@@ -529,7 +529,7 @@ APT.tested = function(prot, test, host)
 end
 
 APT.exe = function(c)
-    local exe = {status = 0, result = '', log = true, cmd = c .. ' '}
+    local exe = {status = 0, result = '', log = true, cmd = c .. ' ', command = c}
 
     function exe:log()
 	self.log = true
@@ -543,6 +543,16 @@ APT.exe = function(c)
 	end
 	return self
     end
+    function exe:timeout(c)
+	-- Prefix self.cmd with coreutils timeout; c is the duration (42.0s is used when c is nil).  Resulting status: 124 if timeout had to send TERM; 137 (128+9) if --kill-after ended up sending KILL; otherwise the command's own status (which is also what --preserve-status would give, but that is not used here).
+	-- --kill-after=10.0 means "send KILL if the command is still running 10 seconds after TERM was sent".  Returns self so calls can be chained.
+	if nil == c then
+	    self.cmd = 'timeout --kill-after=10.0 --foreground -v 42.0s ' .. self.cmd
+	else
+	    self.cmd = 'timeout --kill-after=10.0 --foreground -v ' .. c .. ' ' .. self.cmd
+	end
+	return self
+    end
     function exe:also(c)
 	if nil == c then c = '' else c = ' ' .. c end
 	self.cmd = self.cmd .. ';' .. c .. ' '
@@ -579,17 +589,28 @@ APT.exe = function(c)
 	    I'm getting 7168 or 0.  No idea what the fuck that is.
 	local ok, rslt, status = os.execute(s)
 	]]
-	local f = APT.readCmd(self.cmd .. ' ; echo "$?"', 'r')
+	local f = APT.readCmd(self.cmd, 'r')
 	-- The last line will be the command's returned status, collect everything else in result.
 	self.status = ''	-- Otherwise the result starts with 0.
+	self.result = '\n'
+	for i,l in ipairs(f) do
+	    self.result = self.result .. l .. "\n"
+	end
+	f = APT.readCmd('echo "$?"', 'r')
 	for i,l in ipairs(f) do
-	    self.result = self.result .. self.status .. "\n"
-	    self.status = l
+	    self.status = tonumber(l)
+	    if (137 == self.status) or (124 == self.status) then
+		print("timeout killed " .. self.status .. ' ' .. self.command)
+		E("timeout killed " .. self.status .. ' ' .. self.command)
+	    elseif (0 ~= self.status) then
+		print("status |" .. self.status .. '| ' .. self.command)
+		E("status |" .. self.status .. '| ' .. self.command)
+	    end
 	end
-	self.status = tonumber(self.status)
 	return self
     end
-    function exe:fork()
+    function exe:fork(host)
+	if nil ~= host then self.cmd = self.cmd .. ';  r=$?; if [ $r -ge 124 ]; then echo "$r ' .. host .. ' failed forked command ' .. string.gsub(self.cmd, '"', "'") .. '"; fi' end
 	self.cmd = '{ ' .. self.cmd .. '; } &'
 	if true == self.log then D("  forking -   " .. self.cmd .. "") end
 	os.execute(self.cmd)
diff --git a/apt-panopticon.lua b/apt-panopticon.lua
index 8fd39e5..93243a4 100755
--- a/apt-panopticon.lua
+++ b/apt-panopticon.lua
@@ -330,7 +330,7 @@ checkHEAD = function (host, URL, r, retry, sanity)
 	'curl -I --retry 0 -s --path-as-is --connect-timeout ' .. APT.options.timeout.value .. ' --max-redirs 0 ' .. APT.IPv46 .. ' ' ..
 	IP .. ' ' .. '-o /dev/null -D results/"HEADERS_' .. fname .. '" ' ..
 	hdr .. ' -w "#%{http_code} %{ssl_verify_result} %{url_effective}\\n" ' .. PU.scheme .. '://' .. host .. PU.path .. ' >>results/"STATUS_' .. fname .. '"'
-    ):Nice():log():Do().status
+    ):timeout(APT.options.maxtime.value * 2.0):Nice():log():Do().status
     if 0 < r then
 	APT.tested(PU.scheme, 'Redirects', host)
     else
@@ -356,7 +356,7 @@ checkHEAD = function (host, URL, r, retry, sanity)
     if 0 ~= status then
 	local msg = curlStatus[status]
 	if nil == msg then msg = "UNKNOWN CURL STATUS CODE!" end
-	if (28 == status) or (7 == status) then
+	if (128+9 == status) or (124 == status) or (28 == status) or (7 == status) then
 	    T(spcd .. spcd .. "TIMEOUT " .. timeouts + 1 .. ", retry " .. retry + 1 .. ' ' .. APT.lnk(URL), PU.scheme, sanity, host)
 	    timeouts = timeouts + 1
 	else
@@ -447,7 +447,7 @@ checkHEAD = function (host, URL, r, retry, sanity)
 		    local pth = path:match('^(.*/pool/).*$')
 		    if nil ~= pth then table.insert(APT.results[PU.scheme].redirects, pu.host .. "/" .. pth) else E(spcd .. spcd .. 'Odd redirect path ' .. path) end
 		    I(spcd .. spcd .. "Now checking redirected host " .. u .. '   for   ' .. APT.lnk(URL) .. arw .. APT.lnk(location), host)
-		    APT.exe(downloadLock .. "REDIR-" .. check .. ".log.txt" .. " ./apt-panopticon.lua " .. extraArgs .. ' ' .. pu.host .. "/" .. path .. " " .. file):Nice():log():fork()
+		    APT.exe(downloadLock .. "REDIR-" .. check .. ".log.txt" .. " ./apt-panopticon.lua " .. extraArgs .. ' ' .. pu.host .. "/" .. path .. " " .. file):timeout(APT.options.maxtime.value * 2.0):Nice():log():fork(pu.host)
 		    D(spcd .. 'logging to ' .. APT.logName(pu.host, nil, file)[2])
 		    APT.tested(PU.scheme, 'Redirects', host)
 		end
@@ -534,7 +534,7 @@ checkHost = function (orig, host, path, ip, file)
     else
 	if orig == host then
 	    I("Testing mirror " .. orig .. "" .. file)
-	    APT.exe("./apt-panopticon.lua " .. sendArgs .. " -o " .. orig .. path .. " " .. file):Nice():log():fork()
+	    APT.exe("./apt-panopticon.lua " .. sendArgs .. " -o " .. orig .. path .. " " .. file):timeout(APT.options.maxtime.value * 2.0):Nice():log():fork(orig)
 	    D('logging to ' .. APT.logName(ph.host, nil, file)[2])
 	else D("checkHost " .. orig .. arw .. host) end
     end
@@ -618,7 +618,7 @@ local downloads = function(host, URL, meta, release, list)
 	end
     end
     f:close()
-    APT.exe(cm):Nice():log():fork()
+    APT.exe(cm):timeout(APT.options.maxtime.value * 2.0):Nice():log():fork(host)
     D('logging to ' .. log .. ', with these files')
 end
 
@@ -1129,9 +1129,9 @@ if 0 < #arg then
 		APT.allpairs(ips,
 		    function(k, v)
 			if v == "A" then
-			    if APT.testing("IPv4") then APT.exe('./apt-panopticon.lua ' .. sendArgs .. ' -4 ' .. pu.host .. path .. ' ' .. k .. ' ' .. file):Nice():log():fork() end
+			    if APT.testing("IPv4") then APT.exe('./apt-panopticon.lua ' .. sendArgs .. ' -4 ' .. pu.host .. path .. ' ' .. k .. ' ' .. file):timeout(APT.options.maxtime.value * 2.0):Nice():log():fork(pu.host) end
 			elseif v == "AAAA" then
-			    if APT.testing("IPv6") then APT.exe('./apt-panopticon.lua ' .. sendArgs .. ' -6 ' .. APT.IPv46 .. ' ' .. pu.host .. path .. ' ' .. k .. ' ' .. file):Nice():log():fork() end
+			    if APT.testing("IPv6") then APT.exe('./apt-panopticon.lua ' .. sendArgs .. ' -6 ' .. APT.IPv46 .. ' ' .. pu.host .. path .. ' ' .. k .. ' ' .. file):timeout(APT.options.maxtime.value * 2.0):Nice():log():fork(pu.host) end
 			end
 			D('logging to ' .. APT.logName(pu.host, k, file)[2])
 		    end
diff --git a/laggers b/laggers
new file mode 100755
index 0000000..c5f30ee
--- /dev/null
+++ b/laggers
@@ -0,0 +1,3 @@
+#!/bin/bash
+echo "apt-panopticon processes still running -"	# Heading for the process list printed below.
+ps ax -o pid,args --sort args | grep -E 'apt-panopticon\.lua | curl | dig ' | grep -v -E 'flock -n |grep -E |sh -c |timeout -k |timeout --kill-after'	# Wrapper processes (flock, grep, sh -c, timeout) are filtered out; timeout is launched with --kill-after, so that spelling has to be excluded as well as -k.
diff --git a/update_apt-panopticon b/update_apt-panopticon
index 8edb43d..abbc154 100755
--- a/update_apt-panopticon
+++ b/update_apt-panopticon
@@ -14,18 +14,26 @@ fi
 # Check if the lock file still exists.
 if [ -f apt-panopticon.lock ] ; then
     # Check if it's still running.
-    ps ax -eo pid,args | grep "luajit ./apt-panopticon.lua" | grep -v "grep luajit ./apt-panopticon.lua" | while read line ; do touch apt-panopticon.running ; exit ; done
+    ps ax -eo pid,args | grep "apt-panopticon.lua" | grep -v "grep apt-panopticon.lua" | while read line ; do touch apt-panopticon.running ; exit ; done
     if [ -f apt-panopticon.running ] ; then
 	echo "Previous apt-panopticon still running, exiting."
+	echo "Previous apt-panopticon still running, exiting."
+	./laggers
 	rm apt-panopticon.running
 	exit 1
     fi
     echo "Crashed apt-panopticon detected, removing stale lock file."
+    echo "Crashed apt-panopticon detected, removing stale lock file."
+    ./laggers
     rm apt-panopticon.lock
 fi
 
 rm ../results; ln -s apt-panopticon/results_old ../results
-flock -n apt-panopticon.lock ./apt-panopticon.lua && rm apt-panopticon.lock
+flock -n apt-panopticon.lock ionice -c3 nice -n 19 timeout --kill-after=20.0 --foreground -v 8.5m ./apt-panopticon.lua && rm apt-panopticon.lock
+if [ -f apt-panopticon.lock ] ; then
+    echo "apt-panopticon timed out."
+    ./laggers
+fi
 rm ../results; ln -s apt-panopticon/results ../results
 
 chown -R www-data:www-data *
-- 
cgit v1.1