From 7c1669ace5313408e9bac7c3e0d46366f8178110 Mon Sep 17 00:00:00 2001 From: Scott Cunningham Date: Thu, 24 Nov 2016 16:09:34 +0000 Subject: [PATCH 1/3] Shutdown GraphiteHandler socket connection before closing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without shutting down the socket before closing, with carbon-relay behind an Amazon EC2 ELB, Diamond can get stuck in an inconsistent state where sockets are kept open in a CLOSE-WAIT state. For example, we let two nodes running Diamond report metrics to carbon-relay behind an ELB over a period of 24 hours with the following parameters: - node01 - running Diamond with the socket shutdown logic added (this PR) - node02 - running Diamond without the socket shutdown logic added Partway into the test (16:52), we observed that node02 stopped reporting metrics (see Grafana screenshot attached to GitHub PR) Upon checking the logs, we see the following errors on node02 but NOT node01: [2016-11-24 16:52:27,915] [MainThread] GraphiteHandler: Socket error, trying reconnect. [2016-11-24 17:00:43,056] [MainThread] GraphiteHandler: Socket error, trying reconnect. [2016-11-24 17:02:43,045] [MainThread] GraphiteHandler: Socket error, trying reconnect. [2016-11-24 17:06:52,968] [MainThread] GraphiteHandler: Socket error, trying reconnect. This coincides exactly with the time that the node stopped reporting metrics into carbon. 
When checking the socket status on that node with `ss`, we see the following: ``` root@node02:~# ss -ntp | grep diam CLOSE-WAIT 1 0 172.20.244.54:45725 10.20.255.252:2004 users:(("diamond",12826,4),("diamond",12812,4),("diamond",12803,4),("diamond",12795,4),("diamond",12792,4),("diamond",12786,4),("diamond",12750,4)) CLOSE-WAIT 1 0 172.20.244.54:49532 10.20.255.233:2004 users:(("diamond",12780,4)) ``` For reference, following is the `pstree` output for Diamond processes: ``` ├─diamond(12750)─┬─diamond(12753)─┬─{diamond}(12782) │ │ ├─{diamond}(12788) │ │ ├─{diamond}(12814) │ │ ├─{diamond}(12828) │ │ ├─{diamond}(12853) │ │ ├─{diamond}(12854) │ │ └─{diamond}(12855) │ ├─diamond(12780) │ ├─diamond(12786) │ ├─diamond(12792) │ ├─diamond(12795) │ ├─diamond(12803) │ ├─diamond(12812) │ └─diamond(12826)``` However, on node01 which has the socket shutdown fix, we see the following sockets open: ``` root@node01:~# ss -ntp | grep diam ESTAB 0 0 172.20.245.210:52872 10.20.255.252:2004 users:(("diamond",19942,4)) ``` node01 was able to report metrics for the duration of the test. --- src/diamond/handler/graphite.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diamond/handler/graphite.py b/src/diamond/handler/graphite.py index 006a26a7c..8788d16ec 100644 --- a/src/diamond/handler/graphite.py +++ b/src/diamond/handler/graphite.py @@ -255,5 +255,6 @@ def _close(self): Close the socket """ if self.socket is not None: + self.socket.shutdown(socket.SHUT_RDWR) self.socket.close() self.socket = None From 2723624f98e9234360f81b7b88cba6156ad6118a Mon Sep 17 00:00:00 2001 From: Scott Cunningham Date: Thu, 12 Jan 2017 15:17:01 +0000 Subject: [PATCH 2/3] Revert "Merge pull request #4 from Ensighten/socket-shutdown-fix" This reverts commit ff155a4577f642957e5d108f04550d0f68acd444, reversing changes made to 015e28fbb6d47084b1f9dd8868381c310588f896. 
--- src/diamond/handler/graphite.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diamond/handler/graphite.py b/src/diamond/handler/graphite.py index 8788d16ec..006a26a7c 100644 --- a/src/diamond/handler/graphite.py +++ b/src/diamond/handler/graphite.py @@ -255,6 +255,5 @@ def _close(self): Close the socket """ if self.socket is not None: - self.socket.shutdown(socket.SHUT_RDWR) self.socket.close() self.socket = None From 8b6532978e4c824c46348e1574355ebe577cbf76 Mon Sep 17 00:00:00 2001 From: Scott Cunningham Date: Fri, 13 Jan 2017 11:21:48 +0000 Subject: [PATCH 3/3] Add extended flag to CPU collector This allows us to produce both simple and complex metrics - this way, simple metrics can be aggregated, but we still have the option to drill-down into per-host metrics in situations that require them (eg. checking steal on an individual VM) --- src/collectors/cpu/cpu.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/collectors/cpu/cpu.py b/src/collectors/cpu/cpu.py index fdb93f9ce..3a2aca3ac 100644 --- a/src/collectors/cpu/cpu.py +++ b/src/collectors/cpu/cpu.py @@ -43,6 +43,7 @@ def get_default_config_help(self): config_help.update({ 'percore': 'Collect metrics per cpu core or just total', 'simple': 'only return aggregate CPU% metric', + 'extended': 'return aggregate CPU% metric but also complex CPU metrics', 'normalize': 'for cpu totals, divide by the number of CPUs', }) return config_help @@ -57,6 +58,7 @@ def get_default_config(self): 'percore': 'True', 'xenfix': None, 'simple': 'False', + 'extended': 'False', 'normalize': 'False', }) return config @@ -96,7 +98,9 @@ def cpu_delta_time(interval): dt = cpu_delta_time(self.INTERVAL) cpuPct = 100 - (dt[len(dt) - 1] * 100.00 / sum(dt)) self.publish('percent', str('%.4f' % cpuPct)) - return True + # the 'extended' flag tells us to return simple AND complex CPU metrics + if not str_to_bool(self.config['extended']): + return True results = {} # Open file