From e5d0121c8c47aad63461f2dd167f7b58d292b18b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Cavaill=C3=A9?= Date: Mon, 5 Jan 2015 16:39:49 +0100 Subject: [PATCH] [dogstatsd] handle properly utf8 packets Fixes #1256. We should always consider that dogstatsd receives a utf-8 encoded string through its socket, but still support unicode python strings in case we submit things programmatically (e.g. useful for tests) --- aggregator.py | 10 +++++++++- tests/test_dogstatsd.py | 28 +++++++++++++++------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/aggregator.py b/aggregator.py index 1d951f8828..a0017e272a 100644 --- a/aggregator.py +++ b/aggregator.py @@ -468,7 +468,7 @@ def parse_event_packet(self, packet): # Event syntax: # _e{5,4}:title|body|meta name = name_and_metadata[0] - metadata = unicode(name_and_metadata[1]) + metadata = name_and_metadata[1] title_length, text_length = name.split(',') title_length = int(title_length[3:]) text_length = int(text_length[:-1]) @@ -498,6 +498,14 @@ def parse_event_packet(self, packet): raise Exception(u'Unparseable event packet: %s' % packet) def submit_packets(self, packets): + # Usually we should always decode the string + # as utf-8 because `packets` passed through a + # network socket, but if submit_packets is used + # programmatically and packets is unicode already + # then do not decode! 
+ if not isinstance(packets, unicode): + packets = packets.decode('utf-8', 'replace') + for packet in packets.splitlines(): if not packet.strip(): diff --git a/tests/test_dogstatsd.py b/tests/test_dogstatsd.py index 018366a2bb..ae7255ceb5 100644 --- a/tests/test_dogstatsd.py +++ b/tests/test_dogstatsd.py @@ -601,20 +601,21 @@ def test_event_title(self): stats = MetricsAggregator('myhost') stats.submit_packets('_e{0,4}:|text') stats.submit_packets(u'_e{9,4}:2intitulé|text') + stats.submit_packets(u'_e{9,4}:2intitulé|text'.encode('utf-8')) # comes from socket stats.submit_packets('_e{14,4}:3title content|text') stats.submit_packets('_e{14,4}:4title|content|text') stats.submit_packets('_e{13,4}:5title\\ntitle|text') # \n stays escaped events = self.sort_events(stats.flush_events()) - assert len(events) == 5 - first, second, third, fourth, fifth = events + assert len(events) == 6 - nt.assert_equal(first['msg_title'], '') - nt.assert_equal(second['msg_title'], u'2intitulé') - nt.assert_equal(third['msg_title'], '3title content') - nt.assert_equal(fourth['msg_title'], '4title|content') - nt.assert_equal(fifth['msg_title'], '5title\\ntitle') + nt.assert_equal(events[0]['msg_title'], '') + nt.assert_equal(events[1]['msg_title'], u'2intitulé') + nt.assert_equal(events[2]['msg_title'], u'2intitulé') + nt.assert_equal(events[3]['msg_title'], '3title content') + nt.assert_equal(events[4]['msg_title'], '4title|content') + nt.assert_equal(events[5]['msg_title'], '5title\\ntitle') def test_event_text(self): stats = MetricsAggregator('myhost') @@ -622,16 +623,17 @@ def test_event_text(self): stats.submit_packets('_e{2,12}:t2|text|content') stats.submit_packets('_e{2,23}:t3|First line\\nSecond line') # \n is a newline stats.submit_packets(u'_e{2,19}:t4|♬ †øU †øU ¥ºu T0µ ♪') # utf-8 compliant + stats.submit_packets(u'_e{2,19}:t4|♬ †øU †øU ¥ºu T0µ ♪'.encode('utf-8')) # utf-8 compliant events = self.sort_events(stats.flush_events()) - assert len(events) == 4 - first, second, third, 
fourth = events + assert len(events) == 5 - nt.assert_equal(first['msg_text'], '') - nt.assert_equal(second['msg_text'], 'text|content') - nt.assert_equal(third['msg_text'], 'First line\nSecond line') - nt.assert_equal(fourth['msg_text'], u'♬ †øU †øU ¥ºu T0µ ♪') + nt.assert_equal(events[0]['msg_text'], '') + nt.assert_equal(events[1]['msg_text'], 'text|content') + nt.assert_equal(events[2]['msg_text'], 'First line\nSecond line') + nt.assert_equal(events[3]['msg_text'], u'♬ †øU †øU ¥ºu T0µ ♪') + nt.assert_equal(events[4]['msg_text'], u'♬ †øU †øU ¥ºu T0µ ♪') def test_recent_point_threshold(self): threshold = 100