aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorOren Hurvitz2014-01-14 09:55:19 +0200
committerOren Hurvitz2014-03-25 08:01:45 +0100
commitb13214af273f826699e97bd1f4dd2e49bfe43f5e (patch)
tree1ebc192be2d1160041729908f7b67e799f7f9cbb
parentLogThreadPool: when the thread is for ProcessPacketMethod, also log the packe... (diff)
downloadopensim-SC_OLD-b13214af273f826699e97bd1f4dd2e49bfe43f5e.zip
opensim-SC_OLD-b13214af273f826699e97bd1f4dd2e49bfe43f5e.tar.gz
opensim-SC_OLD-b13214af273f826699e97bd1f4dd2e49bfe43f5e.tar.bz2
opensim-SC_OLD-b13214af273f826699e97bd1f4dd2e49bfe43f5e.tar.xz
Added a watchdog to abort slow threads in the main thread pool
If a task in the thread pool hangs then it will permanently take up one of the threads. If this happens repeatedly then the thread pool will become exhausted and OpenSim will fail. This watchdog terminates threads in the thread pool that have been executing for over a minute (which probably means they're hung). Resolves http://opensimulator.org/mantis/view.php?id=6945
-rw-r--r--OpenSim/Framework/Util.cs280
1 files changed, 269 insertions, 11 deletions
diff --git a/OpenSim/Framework/Util.cs b/OpenSim/Framework/Util.cs
index 6be1722..2de596e 100644
--- a/OpenSim/Framework/Util.cs
+++ b/OpenSim/Framework/Util.cs
@@ -51,6 +51,7 @@ using Nwc.XmlRpc;
51using OpenMetaverse; 51using OpenMetaverse;
52using OpenMetaverse.StructuredData; 52using OpenMetaverse.StructuredData;
53using Amib.Threading; 53using Amib.Threading;
54using System.Collections.Concurrent;
54 55
55namespace OpenSim.Framework 56namespace OpenSim.Framework
56{ 57{
@@ -148,6 +149,9 @@ namespace OpenSim.Framework
148 /// </summary> 149 /// </summary>
149 private static SmartThreadPool m_ThreadPool; 150 private static SmartThreadPool m_ThreadPool;
150 151
152 // Watchdog timer that aborts threads that have timed-out
153 private static Timer m_threadPoolWatchdog;
154
151 // Unix-epoch starts at January 1st 1970, 00:00:00 UTC. And all our times in the server are (or at least should be) in UTC. 155 // Unix-epoch starts at January 1st 1970, 00:00:00 UTC. And all our times in the server are (or at least should be) in UTC.
152 public static readonly DateTime UnixEpoch = 156 public static readonly DateTime UnixEpoch =
153 DateTime.ParseExact("1970-01-01 00:00:00 +0", "yyyy-MM-dd hh:mm:ss z", DateTimeFormatInfo.InvariantInfo).ToUniversalTime(); 157 DateTime.ParseExact("1970-01-01 00:00:00 +0", "yyyy-MM-dd hh:mm:ss z", DateTimeFormatInfo.InvariantInfo).ToUniversalTime();
@@ -1883,6 +1887,7 @@ namespace OpenSim.Framework
1883 startInfo.MinWorkerThreads = minThreads; 1887 startInfo.MinWorkerThreads = minThreads;
1884 1888
1885 m_ThreadPool = new SmartThreadPool(startInfo); 1889 m_ThreadPool = new SmartThreadPool(startInfo);
1890 m_threadPoolWatchdog = new Timer(ThreadPoolWatchdog, null, 0, 1000);
1886 } 1891 }
1887 1892
1888 public static int FireAndForgetCount() 1893 public static int FireAndForgetCount()
@@ -1907,16 +1912,117 @@ namespace OpenSim.Framework
1907 } 1912 }
1908 1913
1909 1914
1915 /// <summary>
1916 /// Additional information about threads in the main thread pool. Used to time how long the
1917 /// thread has been running, and abort it if it has timed-out.
1918 /// </summary>
1919 private class ThreadInfo
1920 {
1921 public long ThreadFuncNum { get; set; }
1922 public string StackTrace { get; set; }
1923 private string context;
1924
1925 public IWorkItemResult WorkItem { get; set; }
1926 public Thread Thread { get; set; }
1927 public bool Running { get; set; }
1928 public bool Aborted { get; set; }
1929 private int started;
1930
1931 public ThreadInfo(long threadFuncNum, string context)
1932 {
1933 ThreadFuncNum = threadFuncNum;
1934 this.context = context;
1935 Thread = null;
1936 Running = false;
1937 Aborted = false;
1938 }
1939
1940 public void Started()
1941 {
1942 Thread = Thread.CurrentThread;
1943 started = EnvironmentTickCount();
1944 Running = true;
1945 }
1946
1947 public void Ended()
1948 {
1949 Running = false;
1950 }
1951
1952 public int Elapsed()
1953 {
1954 return EnvironmentTickCountSubtract(started);
1955 }
1956
1957 public void Abort()
1958 {
1959 Aborted = true;
1960 WorkItem.Cancel(true);
1961 }
1962
1963 /// <summary>
1964 /// Returns the thread's stack trace.
1965 /// </summary>
1966 /// <remarks>
1967 /// May return one of two stack traces. First, tries to get the thread's active stack
1968 /// trace. But this can fail, so as a fallback this method will return the stack
1969 /// trace that was active when the task was queued.
1970 /// </remarks>
1971 public string GetStackTrace()
1972 {
1973 string ret = (context == null) ? "" : ("(" + context + ") ");
1974
1975 StackTrace activeStackTrace = Util.GetStackTrace(Thread);
1976 if (activeStackTrace != null)
1977 ret += activeStackTrace.ToString();
1978 else if (StackTrace != null)
1979 ret += "(Stack trace when queued) " + StackTrace;
1980 // else, no stack trace available
1981
1982 return ret;
1983 }
1984 }
1985
1986
1910 private static long nextThreadFuncNum = 0; 1987 private static long nextThreadFuncNum = 0;
1911 private static long numQueuedThreadFuncs = 0; 1988 private static long numQueuedThreadFuncs = 0;
1912 private static long numRunningThreadFuncs = 0; 1989 private static long numRunningThreadFuncs = 0;
1913 private static Int32 threadFuncOverloadMode = 0; 1990 private static Int32 threadFuncOverloadMode = 0;
1914 1991
1992 // Maps (ThreadFunc number -> Thread)
1993 private static ConcurrentDictionary<long, ThreadInfo> activeThreads = new ConcurrentDictionary<long, ThreadInfo>();
1994
1995 private static readonly int THREAD_TIMEOUT = 60 * 1000;
1996
1997 /// <summary>
1998 /// Finds threads in the main thread pool that have timed-out, and aborts them.
1999 /// </summary>
2000 private static void ThreadPoolWatchdog(object state)
2001 {
2002 foreach (KeyValuePair<long, ThreadInfo> entry in activeThreads)
2003 {
2004 ThreadInfo t = entry.Value;
2005 if (t.Running && !t.Aborted && (t.Elapsed() >= THREAD_TIMEOUT))
2006 {
2007 m_log.WarnFormat("Timeout in threadfunc {0} ({1}) {2}", t.ThreadFuncNum, t.Thread.Name, t.GetStackTrace());
2008 t.Abort();
2009
2010 ThreadInfo dummy;
2011 activeThreads.TryRemove(entry.Key, out dummy);
2012
2013 // It's possible that the thread won't abort. To make sure the thread pool isn't
2014 // depleted, increase the pool size.
2015 m_ThreadPool.MaxThreads++;
2016 }
2017 }
2018 }
2019
2020
1915 public static void FireAndForget(System.Threading.WaitCallback callback, object obj) 2021 public static void FireAndForget(System.Threading.WaitCallback callback, object obj)
1916 { 2022 {
1917 FireAndForget(callback, obj, null); 2023 FireAndForget(callback, obj, null);
1918 } 2024 }
1919 2025
1920 public static void FireAndForget(System.Threading.WaitCallback callback, object obj, string context) 2026 public static void FireAndForget(System.Threading.WaitCallback callback, object obj, string context)
1921 { 2027 {
1922 WaitCallback realCallback; 2028 WaitCallback realCallback;
@@ -1924,6 +2030,7 @@ namespace OpenSim.Framework
1924 bool loggingEnabled = (DebugFlags & DebugFlagsEnum.LogThreadPool) != 0; 2030 bool loggingEnabled = (DebugFlags & DebugFlagsEnum.LogThreadPool) != 0;
1925 2031
1926 long threadFuncNum = Interlocked.Increment(ref nextThreadFuncNum); 2032 long threadFuncNum = Interlocked.Increment(ref nextThreadFuncNum);
2033 ThreadInfo threadInfo = new ThreadInfo(threadFuncNum, context);
1927 2034
1928 if (FireAndForgetMethod == FireAndForgetMethod.RegressionTest) 2035 if (FireAndForgetMethod == FireAndForgetMethod.RegressionTest)
1929 { 2036 {
@@ -1939,6 +2046,8 @@ namespace OpenSim.Framework
1939 { 2046 {
1940 long numQueued1 = Interlocked.Decrement(ref numQueuedThreadFuncs); 2047 long numQueued1 = Interlocked.Decrement(ref numQueuedThreadFuncs);
1941 long numRunning1 = Interlocked.Increment(ref numRunningThreadFuncs); 2048 long numRunning1 = Interlocked.Increment(ref numRunningThreadFuncs);
2049 threadInfo.Started();
2050 activeThreads[threadFuncNum] = threadInfo;
1942 2051
1943 try 2052 try
1944 { 2053 {
@@ -1949,15 +2058,22 @@ namespace OpenSim.Framework
1949 2058
1950 callback(o); 2059 callback(o);
1951 } 2060 }
2061 catch (ThreadAbortException e)
2062 {
2063 m_log.Error(string.Format("Aborted threadfunc {0} ", threadFuncNum), e);
2064 }
1952 catch (Exception e) 2065 catch (Exception e)
1953 { 2066 {
1954 m_log.Error("[UTIL]: FireAndForget thread terminated with error ", e); 2067 m_log.Error(string.Format("[UTIL]: Util STP threadfunc {0} terminated with error ", threadFuncNum), e);
1955 } 2068 }
1956 finally 2069 finally
1957 { 2070 {
1958 Interlocked.Decrement(ref numRunningThreadFuncs); 2071 Interlocked.Decrement(ref numRunningThreadFuncs);
2072 threadInfo.Ended();
2073 ThreadInfo dummy;
2074 activeThreads.TryRemove(threadFuncNum, out dummy);
1959 if (loggingEnabled || (threadFuncOverloadMode == 1)) 2075 if (loggingEnabled || (threadFuncOverloadMode == 1))
1960 m_log.Debug("Exit threadfunc " + threadFuncNum); 2076 m_log.DebugFormat("Exit threadfunc {0} ({1})", threadFuncNum, FormatDuration(threadInfo.Elapsed()));
1961 } 2077 }
1962 }; 2078 };
1963 } 2079 }
@@ -1979,10 +2095,13 @@ namespace OpenSim.Framework
1979 2095
1980 if (loggingEnabled || (threadFuncOverloadMode == 1)) 2096 if (loggingEnabled || (threadFuncOverloadMode == 1))
1981 { 2097 {
2098 string full, partial;
2099 GetFireAndForgetStackTrace(out full, out partial);
1982 m_log.DebugFormat("Queue threadfunc {0} (Queued {1}, Running {2}) {3}{4}", 2100 m_log.DebugFormat("Queue threadfunc {0} (Queued {1}, Running {2}) {3}{4}",
1983 threadFuncNum, numQueued, numRunningThreadFuncs, 2101 threadFuncNum, numQueued, numRunningThreadFuncs,
1984 (context == null) ? "" : ("(" + context + ") "), 2102 (context == null) ? "" : ("(" + context + ") "),
1985 GetFireAndForgetStackTrace(loggingEnabled)); 2103 loggingEnabled ? full : partial);
2104 threadInfo.StackTrace = full;
1986 } 2105 }
1987 2106
1988 switch (FireAndForgetMethod) 2107 switch (FireAndForgetMethod)
@@ -2004,7 +2123,7 @@ namespace OpenSim.Framework
2004 case FireAndForgetMethod.SmartThreadPool: 2123 case FireAndForgetMethod.SmartThreadPool:
2005 if (m_ThreadPool == null) 2124 if (m_ThreadPool == null)
2006 InitThreadPool(2, 15); 2125 InitThreadPool(2, 15);
2007 m_ThreadPool.QueueWorkItem((cb, o) => cb(o), realCallback, obj); 2126 threadInfo.WorkItem = m_ThreadPool.QueueWorkItem((cb, o) => cb(o), realCallback, obj);
2008 break; 2127 break;
2009 case FireAndForgetMethod.Thread: 2128 case FireAndForgetMethod.Thread:
2010 Thread thread = new Thread(delegate(object o) { realCallback(o); }); 2129 Thread thread = new Thread(delegate(object o) { realCallback(o); });
@@ -2017,6 +2136,8 @@ namespace OpenSim.Framework
2017 catch (Exception) 2136 catch (Exception)
2018 { 2137 {
2019 Interlocked.Decrement(ref numQueuedThreadFuncs); 2138 Interlocked.Decrement(ref numQueuedThreadFuncs);
2139 ThreadInfo dummy;
2140 activeThreads.TryRemove(threadFuncNum, out dummy);
2020 throw; 2141 throw;
2021 } 2142 }
2022 } 2143 }
@@ -2024,8 +2145,9 @@ namespace OpenSim.Framework
2024 /// <summary> 2145 /// <summary>
2025 /// Returns a stack trace for a thread added using FireAndForget(). 2146 /// Returns a stack trace for a thread added using FireAndForget().
2026 /// </summary> 2147 /// </summary>
2027 /// <param name="full">True: return full stack trace; False: return only the first frame</param> 2148 /// <param name="full">Will contain the full stack trace</param>
2028 private static string GetFireAndForgetStackTrace(bool full) 2149 /// <param name="partial">Will contain only the first frame of the stack trace</param>
2150 private static void GetFireAndForgetStackTrace(out string full, out string partial)
2029 { 2151 {
2030 string src = Environment.StackTrace; 2152 string src = Environment.StackTrace;
2031 string[] lines = src.Split(new string[] { Environment.NewLine }, StringSplitOptions.None); 2153 string[] lines = src.Split(new string[] { Environment.NewLine }, StringSplitOptions.None);
@@ -2034,6 +2156,7 @@ namespace OpenSim.Framework
2034 2156
2035 bool started = false; 2157 bool started = false;
2036 bool first = true; 2158 bool first = true;
2159 partial = "";
2037 2160
2038 for (int i = 0; i < lines.Length; i++) 2161 for (int i = 0; i < lines.Length; i++)
2039 { 2162 {
@@ -2051,20 +2174,101 @@ namespace OpenSim.Framework
2051 { 2174 {
2052 line = line.TrimStart(); 2175 line = line.TrimStart();
2053 first = false; 2176 first = false;
2177 partial = line;
2054 } 2178 }
2055 2179
2056 bool last = (i == lines.Length - 1) || !full; 2180 bool last = (i == lines.Length - 1);
2057 if (last) 2181 if (last)
2058 dest.Append(line); 2182 dest.Append(line);
2059 else 2183 else
2060 dest.AppendLine(line); 2184 dest.AppendLine(line);
2185 }
2061 2186
2062 if (!full) 2187 full = dest.ToString();
2063 break; 2188 }
2189
2190#pragma warning disable 0618
2191 /// <summary>
2192 /// Return the stack trace of a different thread.
2193 /// </summary>
2194 /// <remarks>
2195 /// This is complicated because the thread needs to be paused in order to get its stack
2196 /// trace. And pausing another thread can cause a deadlock. This method attempts to
2197 /// avoid deadlock by using a short timeout (200ms), after which it gives up and
2198 /// returns 'null' instead of the stack trace.
2199 ///
2200 /// Take from: http://stackoverflow.com/a/14935378
2201 ///
2202 /// WARNING: this doesn't work in Mono. See https://bugzilla.novell.com/show_bug.cgi?id=571691
2203 ///
2204 /// </remarks>
2205 /// <returns>The stack trace, or null if failed to get it</returns>
2206 private static StackTrace GetStackTrace(Thread targetThread)
2207 {
2208 if (IsPlatformMono)
2209 {
2210 // This doesn't work in Mono
2211 return null;
2064 } 2212 }
2065 2213
2066 return dest.ToString(); 2214 ManualResetEventSlim fallbackThreadReady = new ManualResetEventSlim();
2215 ManualResetEventSlim exitedSafely = new ManualResetEventSlim();
2216
2217 try
2218 {
2219 new Thread(delegate()
2220 {
2221 fallbackThreadReady.Set();
2222 while (!exitedSafely.Wait(200))
2223 {
2224 try
2225 {
2226 targetThread.Resume();
2227 }
2228 catch (Exception)
2229 {
2230 // Whatever happens, do never stop to resume the main-thread regularly until the main-thread has exited safely.
2231 }
2232 }
2233 }).Start();
2234
2235 fallbackThreadReady.Wait();
2236 // From here, you have about 200ms to get the stack-trace
2237
2238 targetThread.Suspend();
2239
2240 StackTrace trace = null;
2241 try
2242 {
2243 trace = new StackTrace(targetThread, true);
2244 }
2245 catch (ThreadStateException)
2246 {
2247 //failed to get stack trace, since the fallback-thread resumed the thread
2248 //possible reasons:
2249 //1.) This thread was just too slow
2250 //2.) A deadlock ocurred
2251 //Automatic retry seems too risky here, so just return null.
2252 }
2253
2254 try
2255 {
2256 targetThread.Resume();
2257 }
2258 catch (ThreadStateException)
2259 {
2260 // Thread is running again already
2261 }
2262
2263 return trace;
2264 }
2265 finally
2266 {
2267 // Signal the fallack-thread to stop
2268 exitedSafely.Set();
2269 }
2067 } 2270 }
2271#pragma warning restore 0618
2068 2272
2069 /// <summary> 2273 /// <summary>
2070 /// Get information about the current state of the smart thread pool. 2274 /// Get information about the current state of the smart thread pool.
@@ -2150,6 +2354,60 @@ namespace OpenSim.Framework
2150 } 2354 }
2151 2355
2152 /// <summary> 2356 /// <summary>
2357 /// Formats a duration (given in milliseconds).
2358 /// </summary>
2359 public static string FormatDuration(int ms)
2360 {
2361 TimeSpan span = new TimeSpan(ms * TimeSpan.TicksPerMillisecond);
2362
2363 string str = "";
2364 string suffix = null;
2365
2366 int hours = (int)span.TotalHours;
2367 if (hours > 0)
2368 {
2369 str += hours.ToString(str.Length == 0 ? "0" : "00");
2370 suffix = "hours";
2371 }
2372
2373 if ((hours > 0) || (span.Minutes > 0))
2374 {
2375 if (str.Length > 0)
2376 str += ":";
2377 str += span.Minutes.ToString(str.Length == 0 ? "0" : "00");
2378 if (suffix == null)
2379 suffix = "min";
2380 }
2381
2382 if ((hours > 0) || (span.Minutes > 0) || (span.Seconds > 0))
2383 {
2384 if (str.Length > 0)
2385 str += ":";
2386 str += span.Seconds.ToString(str.Length == 0 ? "0" : "00");
2387 if (suffix == null)
2388 suffix = "sec";
2389 }
2390
2391 if (suffix == null)
2392 suffix = "ms";
2393
2394 if (span.TotalMinutes < 1)
2395 {
2396 int ms1 = span.Milliseconds;
2397 if (str.Length > 0)
2398 {
2399 ms1 /= 100;
2400 str += ".";
2401 }
2402 str += ms1.ToString("0");
2403 }
2404
2405 str += " " + suffix;
2406
2407 return str;
2408 }
2409
2410 /// <summary>
2153 /// Prints the call stack at any given point. Useful for debugging. 2411 /// Prints the call stack at any given point. Useful for debugging.
2154 /// </summary> 2412 /// </summary>
2155 public static void PrintCallStack() 2413 public static void PrintCallStack()