diff options
author | Oren Hurvitz | 2014-01-14 09:55:19 +0200 |
---|---|---|
committer | Oren Hurvitz | 2014-03-25 08:01:45 +0100 |
commit | b13214af273f826699e97bd1f4dd2e49bfe43f5e (patch) | |
tree | 1ebc192be2d1160041729908f7b67e799f7f9cbb | |
parent | LogThreadPool: when the thread is for ProcessPacketMethod, also log the packe... (diff) | |
download | opensim-SC_OLD-b13214af273f826699e97bd1f4dd2e49bfe43f5e.zip opensim-SC_OLD-b13214af273f826699e97bd1f4dd2e49bfe43f5e.tar.gz opensim-SC_OLD-b13214af273f826699e97bd1f4dd2e49bfe43f5e.tar.bz2 opensim-SC_OLD-b13214af273f826699e97bd1f4dd2e49bfe43f5e.tar.xz |
Added a watchdog to abort slow threads in the main thread pool
If a task in the thread pool hangs then it will permanently take up
one of the threads. If this happens repeatedly then the thread pool will
become exhausted and OpenSim will fail. This watchdog terminates threads in the
thread pool that have been executing for over a minute (which probably
means they're hung).
Resolves http://opensimulator.org/mantis/view.php?id=6945
-rw-r--r-- | OpenSim/Framework/Util.cs | 280 |
1 files changed, 269 insertions, 11 deletions
diff --git a/OpenSim/Framework/Util.cs b/OpenSim/Framework/Util.cs index 6be1722..2de596e 100644 --- a/OpenSim/Framework/Util.cs +++ b/OpenSim/Framework/Util.cs | |||
@@ -51,6 +51,7 @@ using Nwc.XmlRpc; | |||
51 | using OpenMetaverse; | 51 | using OpenMetaverse; |
52 | using OpenMetaverse.StructuredData; | 52 | using OpenMetaverse.StructuredData; |
53 | using Amib.Threading; | 53 | using Amib.Threading; |
54 | using System.Collections.Concurrent; | ||
54 | 55 | ||
55 | namespace OpenSim.Framework | 56 | namespace OpenSim.Framework |
56 | { | 57 | { |
@@ -148,6 +149,9 @@ namespace OpenSim.Framework | |||
148 | /// </summary> | 149 | /// </summary> |
149 | private static SmartThreadPool m_ThreadPool; | 150 | private static SmartThreadPool m_ThreadPool; |
150 | 151 | ||
152 | // Watchdog timer that aborts threads that have timed-out | ||
153 | private static Timer m_threadPoolWatchdog; | ||
154 | |||
151 | // Unix-epoch starts at January 1st 1970, 00:00:00 UTC. And all our times in the server are (or at least should be) in UTC. | 155 | // Unix-epoch starts at January 1st 1970, 00:00:00 UTC. And all our times in the server are (or at least should be) in UTC. |
152 | public static readonly DateTime UnixEpoch = | 156 | public static readonly DateTime UnixEpoch = |
153 | DateTime.ParseExact("1970-01-01 00:00:00 +0", "yyyy-MM-dd hh:mm:ss z", DateTimeFormatInfo.InvariantInfo).ToUniversalTime(); | 157 | DateTime.ParseExact("1970-01-01 00:00:00 +0", "yyyy-MM-dd hh:mm:ss z", DateTimeFormatInfo.InvariantInfo).ToUniversalTime(); |
@@ -1883,6 +1887,7 @@ namespace OpenSim.Framework | |||
1883 | startInfo.MinWorkerThreads = minThreads; | 1887 | startInfo.MinWorkerThreads = minThreads; |
1884 | 1888 | ||
1885 | m_ThreadPool = new SmartThreadPool(startInfo); | 1889 | m_ThreadPool = new SmartThreadPool(startInfo); |
1890 | m_threadPoolWatchdog = new Timer(ThreadPoolWatchdog, null, 0, 1000); | ||
1886 | } | 1891 | } |
1887 | 1892 | ||
1888 | public static int FireAndForgetCount() | 1893 | public static int FireAndForgetCount() |
@@ -1907,16 +1912,117 @@ namespace OpenSim.Framework | |||
1907 | } | 1912 | } |
1908 | 1913 | ||
1909 | 1914 | ||
1915 | /// <summary> | ||
1916 | /// Additional information about threads in the main thread pool. Used to time how long the | ||
1917 | /// thread has been running, and abort it if it has timed-out. | ||
1918 | /// </summary> | ||
1919 | private class ThreadInfo | ||
1920 | { | ||
1921 | public long ThreadFuncNum { get; set; } | ||
1922 | public string StackTrace { get; set; } | ||
1923 | private string context; | ||
1924 | |||
1925 | public IWorkItemResult WorkItem { get; set; } | ||
1926 | public Thread Thread { get; set; } | ||
1927 | public bool Running { get; set; } | ||
1928 | public bool Aborted { get; set; } | ||
1929 | private int started; | ||
1930 | |||
1931 | public ThreadInfo(long threadFuncNum, string context) | ||
1932 | { | ||
1933 | ThreadFuncNum = threadFuncNum; | ||
1934 | this.context = context; | ||
1935 | Thread = null; | ||
1936 | Running = false; | ||
1937 | Aborted = false; | ||
1938 | } | ||
1939 | |||
1940 | public void Started() | ||
1941 | { | ||
1942 | Thread = Thread.CurrentThread; | ||
1943 | started = EnvironmentTickCount(); | ||
1944 | Running = true; | ||
1945 | } | ||
1946 | |||
1947 | public void Ended() | ||
1948 | { | ||
1949 | Running = false; | ||
1950 | } | ||
1951 | |||
1952 | public int Elapsed() | ||
1953 | { | ||
1954 | return EnvironmentTickCountSubtract(started); | ||
1955 | } | ||
1956 | |||
1957 | public void Abort() | ||
1958 | { | ||
1959 | Aborted = true; | ||
1960 | WorkItem.Cancel(true); | ||
1961 | } | ||
1962 | |||
1963 | /// <summary> | ||
1964 | /// Returns the thread's stack trace. | ||
1965 | /// </summary> | ||
1966 | /// <remarks> | ||
1967 | /// May return one of two stack traces. First, tries to get the thread's active stack | ||
1968 | /// trace. But this can fail, so as a fallback this method will return the stack | ||
1969 | /// trace that was active when the task was queued. | ||
1970 | /// </remarks> | ||
1971 | public string GetStackTrace() | ||
1972 | { | ||
1973 | string ret = (context == null) ? "" : ("(" + context + ") "); | ||
1974 | |||
1975 | StackTrace activeStackTrace = Util.GetStackTrace(Thread); | ||
1976 | if (activeStackTrace != null) | ||
1977 | ret += activeStackTrace.ToString(); | ||
1978 | else if (StackTrace != null) | ||
1979 | ret += "(Stack trace when queued) " + StackTrace; | ||
1980 | // else, no stack trace available | ||
1981 | |||
1982 | return ret; | ||
1983 | } | ||
1984 | } | ||
1985 | |||
1986 | |||
1910 | private static long nextThreadFuncNum = 0; | 1987 | private static long nextThreadFuncNum = 0; |
1911 | private static long numQueuedThreadFuncs = 0; | 1988 | private static long numQueuedThreadFuncs = 0; |
1912 | private static long numRunningThreadFuncs = 0; | 1989 | private static long numRunningThreadFuncs = 0; |
1913 | private static Int32 threadFuncOverloadMode = 0; | 1990 | private static Int32 threadFuncOverloadMode = 0; |
1914 | 1991 | ||
1992 | // Maps (ThreadFunc number -> Thread) | ||
1993 | private static ConcurrentDictionary<long, ThreadInfo> activeThreads = new ConcurrentDictionary<long, ThreadInfo>(); | ||
1994 | |||
1995 | private static readonly int THREAD_TIMEOUT = 60 * 1000; | ||
1996 | |||
1997 | /// <summary> | ||
1998 | /// Finds threads in the main thread pool that have timed-out, and aborts them. | ||
1999 | /// </summary> | ||
2000 | private static void ThreadPoolWatchdog(object state) | ||
2001 | { | ||
2002 | foreach (KeyValuePair<long, ThreadInfo> entry in activeThreads) | ||
2003 | { | ||
2004 | ThreadInfo t = entry.Value; | ||
2005 | if (t.Running && !t.Aborted && (t.Elapsed() >= THREAD_TIMEOUT)) | ||
2006 | { | ||
2007 | m_log.WarnFormat("Timeout in threadfunc {0} ({1}) {2}", t.ThreadFuncNum, t.Thread.Name, t.GetStackTrace()); | ||
2008 | t.Abort(); | ||
2009 | |||
2010 | ThreadInfo dummy; | ||
2011 | activeThreads.TryRemove(entry.Key, out dummy); | ||
2012 | |||
2013 | // It's possible that the thread won't abort. To make sure the thread pool isn't | ||
2014 | // depleted, increase the pool size. | ||
2015 | m_ThreadPool.MaxThreads++; | ||
2016 | } | ||
2017 | } | ||
2018 | } | ||
2019 | |||
2020 | |||
1915 | public static void FireAndForget(System.Threading.WaitCallback callback, object obj) | 2021 | public static void FireAndForget(System.Threading.WaitCallback callback, object obj) |
1916 | { | 2022 | { |
1917 | FireAndForget(callback, obj, null); | 2023 | FireAndForget(callback, obj, null); |
1918 | } | 2024 | } |
1919 | 2025 | ||
1920 | public static void FireAndForget(System.Threading.WaitCallback callback, object obj, string context) | 2026 | public static void FireAndForget(System.Threading.WaitCallback callback, object obj, string context) |
1921 | { | 2027 | { |
1922 | WaitCallback realCallback; | 2028 | WaitCallback realCallback; |
@@ -1924,6 +2030,7 @@ namespace OpenSim.Framework | |||
1924 | bool loggingEnabled = (DebugFlags & DebugFlagsEnum.LogThreadPool) != 0; | 2030 | bool loggingEnabled = (DebugFlags & DebugFlagsEnum.LogThreadPool) != 0; |
1925 | 2031 | ||
1926 | long threadFuncNum = Interlocked.Increment(ref nextThreadFuncNum); | 2032 | long threadFuncNum = Interlocked.Increment(ref nextThreadFuncNum); |
2033 | ThreadInfo threadInfo = new ThreadInfo(threadFuncNum, context); | ||
1927 | 2034 | ||
1928 | if (FireAndForgetMethod == FireAndForgetMethod.RegressionTest) | 2035 | if (FireAndForgetMethod == FireAndForgetMethod.RegressionTest) |
1929 | { | 2036 | { |
@@ -1939,6 +2046,8 @@ namespace OpenSim.Framework | |||
1939 | { | 2046 | { |
1940 | long numQueued1 = Interlocked.Decrement(ref numQueuedThreadFuncs); | 2047 | long numQueued1 = Interlocked.Decrement(ref numQueuedThreadFuncs); |
1941 | long numRunning1 = Interlocked.Increment(ref numRunningThreadFuncs); | 2048 | long numRunning1 = Interlocked.Increment(ref numRunningThreadFuncs); |
2049 | threadInfo.Started(); | ||
2050 | activeThreads[threadFuncNum] = threadInfo; | ||
1942 | 2051 | ||
1943 | try | 2052 | try |
1944 | { | 2053 | { |
@@ -1949,15 +2058,22 @@ namespace OpenSim.Framework | |||
1949 | 2058 | ||
1950 | callback(o); | 2059 | callback(o); |
1951 | } | 2060 | } |
2061 | catch (ThreadAbortException e) | ||
2062 | { | ||
2063 | m_log.Error(string.Format("Aborted threadfunc {0} ", threadFuncNum), e); | ||
2064 | } | ||
1952 | catch (Exception e) | 2065 | catch (Exception e) |
1953 | { | 2066 | { |
1954 | m_log.Error("[UTIL]: FireAndForget thread terminated with error ", e); | 2067 | m_log.Error(string.Format("[UTIL]: Util STP threadfunc {0} terminated with error ", threadFuncNum), e); |
1955 | } | 2068 | } |
1956 | finally | 2069 | finally |
1957 | { | 2070 | { |
1958 | Interlocked.Decrement(ref numRunningThreadFuncs); | 2071 | Interlocked.Decrement(ref numRunningThreadFuncs); |
2072 | threadInfo.Ended(); | ||
2073 | ThreadInfo dummy; | ||
2074 | activeThreads.TryRemove(threadFuncNum, out dummy); | ||
1959 | if (loggingEnabled || (threadFuncOverloadMode == 1)) | 2075 | if (loggingEnabled || (threadFuncOverloadMode == 1)) |
1960 | m_log.Debug("Exit threadfunc " + threadFuncNum); | 2076 | m_log.DebugFormat("Exit threadfunc {0} ({1})", threadFuncNum, FormatDuration(threadInfo.Elapsed())); |
1961 | } | 2077 | } |
1962 | }; | 2078 | }; |
1963 | } | 2079 | } |
@@ -1979,10 +2095,13 @@ namespace OpenSim.Framework | |||
1979 | 2095 | ||
1980 | if (loggingEnabled || (threadFuncOverloadMode == 1)) | 2096 | if (loggingEnabled || (threadFuncOverloadMode == 1)) |
1981 | { | 2097 | { |
2098 | string full, partial; | ||
2099 | GetFireAndForgetStackTrace(out full, out partial); | ||
1982 | m_log.DebugFormat("Queue threadfunc {0} (Queued {1}, Running {2}) {3}{4}", | 2100 | m_log.DebugFormat("Queue threadfunc {0} (Queued {1}, Running {2}) {3}{4}", |
1983 | threadFuncNum, numQueued, numRunningThreadFuncs, | 2101 | threadFuncNum, numQueued, numRunningThreadFuncs, |
1984 | (context == null) ? "" : ("(" + context + ") "), | 2102 | (context == null) ? "" : ("(" + context + ") "), |
1985 | GetFireAndForgetStackTrace(loggingEnabled)); | 2103 | loggingEnabled ? full : partial); |
2104 | threadInfo.StackTrace = full; | ||
1986 | } | 2105 | } |
1987 | 2106 | ||
1988 | switch (FireAndForgetMethod) | 2107 | switch (FireAndForgetMethod) |
@@ -2004,7 +2123,7 @@ namespace OpenSim.Framework | |||
2004 | case FireAndForgetMethod.SmartThreadPool: | 2123 | case FireAndForgetMethod.SmartThreadPool: |
2005 | if (m_ThreadPool == null) | 2124 | if (m_ThreadPool == null) |
2006 | InitThreadPool(2, 15); | 2125 | InitThreadPool(2, 15); |
2007 | m_ThreadPool.QueueWorkItem((cb, o) => cb(o), realCallback, obj); | 2126 | threadInfo.WorkItem = m_ThreadPool.QueueWorkItem((cb, o) => cb(o), realCallback, obj); |
2008 | break; | 2127 | break; |
2009 | case FireAndForgetMethod.Thread: | 2128 | case FireAndForgetMethod.Thread: |
2010 | Thread thread = new Thread(delegate(object o) { realCallback(o); }); | 2129 | Thread thread = new Thread(delegate(object o) { realCallback(o); }); |
@@ -2017,6 +2136,8 @@ namespace OpenSim.Framework | |||
2017 | catch (Exception) | 2136 | catch (Exception) |
2018 | { | 2137 | { |
2019 | Interlocked.Decrement(ref numQueuedThreadFuncs); | 2138 | Interlocked.Decrement(ref numQueuedThreadFuncs); |
2139 | ThreadInfo dummy; | ||
2140 | activeThreads.TryRemove(threadFuncNum, out dummy); | ||
2020 | throw; | 2141 | throw; |
2021 | } | 2142 | } |
2022 | } | 2143 | } |
@@ -2024,8 +2145,9 @@ namespace OpenSim.Framework | |||
2024 | /// <summary> | 2145 | /// <summary> |
2025 | /// Returns a stack trace for a thread added using FireAndForget(). | 2146 | /// Returns a stack trace for a thread added using FireAndForget(). |
2026 | /// </summary> | 2147 | /// </summary> |
2027 | /// <param name="full">True: return full stack trace; False: return only the first frame</param> | 2148 | /// <param name="full">Will contain the full stack trace</param> |
2028 | private static string GetFireAndForgetStackTrace(bool full) | 2149 | /// <param name="partial">Will contain only the first frame of the stack trace</param> |
2150 | private static void GetFireAndForgetStackTrace(out string full, out string partial) | ||
2029 | { | 2151 | { |
2030 | string src = Environment.StackTrace; | 2152 | string src = Environment.StackTrace; |
2031 | string[] lines = src.Split(new string[] { Environment.NewLine }, StringSplitOptions.None); | 2153 | string[] lines = src.Split(new string[] { Environment.NewLine }, StringSplitOptions.None); |
@@ -2034,6 +2156,7 @@ namespace OpenSim.Framework | |||
2034 | 2156 | ||
2035 | bool started = false; | 2157 | bool started = false; |
2036 | bool first = true; | 2158 | bool first = true; |
2159 | partial = ""; | ||
2037 | 2160 | ||
2038 | for (int i = 0; i < lines.Length; i++) | 2161 | for (int i = 0; i < lines.Length; i++) |
2039 | { | 2162 | { |
@@ -2051,20 +2174,101 @@ namespace OpenSim.Framework | |||
2051 | { | 2174 | { |
2052 | line = line.TrimStart(); | 2175 | line = line.TrimStart(); |
2053 | first = false; | 2176 | first = false; |
2177 | partial = line; | ||
2054 | } | 2178 | } |
2055 | 2179 | ||
2056 | bool last = (i == lines.Length - 1) || !full; | 2180 | bool last = (i == lines.Length - 1); |
2057 | if (last) | 2181 | if (last) |
2058 | dest.Append(line); | 2182 | dest.Append(line); |
2059 | else | 2183 | else |
2060 | dest.AppendLine(line); | 2184 | dest.AppendLine(line); |
2185 | } | ||
2061 | 2186 | ||
2062 | if (!full) | 2187 | full = dest.ToString(); |
2063 | break; | 2188 | } |
2189 | |||
2190 | #pragma warning disable 0618 | ||
2191 | /// <summary> | ||
2192 | /// Return the stack trace of a different thread. | ||
2193 | /// </summary> | ||
2194 | /// <remarks> | ||
2195 | /// This is complicated because the thread needs to be paused in order to get its stack | ||
2196 | /// trace. And pausing another thread can cause a deadlock. This method attempts to | ||
2197 | /// avoid deadlock by using a short timeout (200ms), after which it gives up and | ||
2198 | /// returns 'null' instead of the stack trace. | ||
2199 | /// | ||
2200 | /// Take from: http://stackoverflow.com/a/14935378 | ||
2201 | /// | ||
2202 | /// WARNING: this doesn't work in Mono. See https://bugzilla.novell.com/show_bug.cgi?id=571691 | ||
2203 | /// | ||
2204 | /// </remarks> | ||
2205 | /// <returns>The stack trace, or null if failed to get it</returns> | ||
2206 | private static StackTrace GetStackTrace(Thread targetThread) | ||
2207 | { | ||
2208 | if (IsPlatformMono) | ||
2209 | { | ||
2210 | // This doesn't work in Mono | ||
2211 | return null; | ||
2064 | } | 2212 | } |
2065 | 2213 | ||
2066 | return dest.ToString(); | 2214 | ManualResetEventSlim fallbackThreadReady = new ManualResetEventSlim(); |
2215 | ManualResetEventSlim exitedSafely = new ManualResetEventSlim(); | ||
2216 | |||
2217 | try | ||
2218 | { | ||
2219 | new Thread(delegate() | ||
2220 | { | ||
2221 | fallbackThreadReady.Set(); | ||
2222 | while (!exitedSafely.Wait(200)) | ||
2223 | { | ||
2224 | try | ||
2225 | { | ||
2226 | targetThread.Resume(); | ||
2227 | } | ||
2228 | catch (Exception) | ||
2229 | { | ||
2230 | // Whatever happens, do never stop to resume the main-thread regularly until the main-thread has exited safely. | ||
2231 | } | ||
2232 | } | ||
2233 | }).Start(); | ||
2234 | |||
2235 | fallbackThreadReady.Wait(); | ||
2236 | // From here, you have about 200ms to get the stack-trace | ||
2237 | |||
2238 | targetThread.Suspend(); | ||
2239 | |||
2240 | StackTrace trace = null; | ||
2241 | try | ||
2242 | { | ||
2243 | trace = new StackTrace(targetThread, true); | ||
2244 | } | ||
2245 | catch (ThreadStateException) | ||
2246 | { | ||
2247 | //failed to get stack trace, since the fallback-thread resumed the thread | ||
2248 | //possible reasons: | ||
2249 | //1.) This thread was just too slow | ||
2250 | //2.) A deadlock ocurred | ||
2251 | //Automatic retry seems too risky here, so just return null. | ||
2252 | } | ||
2253 | |||
2254 | try | ||
2255 | { | ||
2256 | targetThread.Resume(); | ||
2257 | } | ||
2258 | catch (ThreadStateException) | ||
2259 | { | ||
2260 | // Thread is running again already | ||
2261 | } | ||
2262 | |||
2263 | return trace; | ||
2264 | } | ||
2265 | finally | ||
2266 | { | ||
2267 | // Signal the fallack-thread to stop | ||
2268 | exitedSafely.Set(); | ||
2269 | } | ||
2067 | } | 2270 | } |
2271 | #pragma warning restore 0618 | ||
2068 | 2272 | ||
2069 | /// <summary> | 2273 | /// <summary> |
2070 | /// Get information about the current state of the smart thread pool. | 2274 | /// Get information about the current state of the smart thread pool. |
@@ -2150,6 +2354,60 @@ namespace OpenSim.Framework | |||
2150 | } | 2354 | } |
2151 | 2355 | ||
2152 | /// <summary> | 2356 | /// <summary> |
2357 | /// Formats a duration (given in milliseconds). | ||
2358 | /// </summary> | ||
2359 | public static string FormatDuration(int ms) | ||
2360 | { | ||
2361 | TimeSpan span = new TimeSpan(ms * TimeSpan.TicksPerMillisecond); | ||
2362 | |||
2363 | string str = ""; | ||
2364 | string suffix = null; | ||
2365 | |||
2366 | int hours = (int)span.TotalHours; | ||
2367 | if (hours > 0) | ||
2368 | { | ||
2369 | str += hours.ToString(str.Length == 0 ? "0" : "00"); | ||
2370 | suffix = "hours"; | ||
2371 | } | ||
2372 | |||
2373 | if ((hours > 0) || (span.Minutes > 0)) | ||
2374 | { | ||
2375 | if (str.Length > 0) | ||
2376 | str += ":"; | ||
2377 | str += span.Minutes.ToString(str.Length == 0 ? "0" : "00"); | ||
2378 | if (suffix == null) | ||
2379 | suffix = "min"; | ||
2380 | } | ||
2381 | |||
2382 | if ((hours > 0) || (span.Minutes > 0) || (span.Seconds > 0)) | ||
2383 | { | ||
2384 | if (str.Length > 0) | ||
2385 | str += ":"; | ||
2386 | str += span.Seconds.ToString(str.Length == 0 ? "0" : "00"); | ||
2387 | if (suffix == null) | ||
2388 | suffix = "sec"; | ||
2389 | } | ||
2390 | |||
2391 | if (suffix == null) | ||
2392 | suffix = "ms"; | ||
2393 | |||
2394 | if (span.TotalMinutes < 1) | ||
2395 | { | ||
2396 | int ms1 = span.Milliseconds; | ||
2397 | if (str.Length > 0) | ||
2398 | { | ||
2399 | ms1 /= 100; | ||
2400 | str += "."; | ||
2401 | } | ||
2402 | str += ms1.ToString("0"); | ||
2403 | } | ||
2404 | |||
2405 | str += " " + suffix; | ||
2406 | |||
2407 | return str; | ||
2408 | } | ||
2409 | |||
2410 | /// <summary> | ||
2153 | /// Prints the call stack at any given point. Useful for debugging. | 2411 | /// Prints the call stack at any given point. Useful for debugging. |
2154 | /// </summary> | 2412 | /// </summary> |
2155 | public static void PrintCallStack() | 2413 | public static void PrintCallStack() |