From 2ecab34fe16e2862aaa8836673856802165e2b98 Mon Sep 17 00:00:00 2001
From: Benjy Weinberger
Date: Thu, 3 May 2018 10:05:31 -0700
Subject: [PATCH 1/3] A utility to aggregate S3 access logs.

Helps us track which binaries our S3 bandwidth costs are being spent on.
---
 3rdparty/python/BUILD                      | 11 +++-
 src/python/pants/util/BUILD                | 16 ++++++
 src/python/pants/util/s3_log_aggregator.py | 65 ++++++++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 src/python/pants/util/s3_log_aggregator.py

diff --git a/3rdparty/python/BUILD b/3rdparty/python/BUILD
index afaf91ac08e..1cae9504856 100644
--- a/3rdparty/python/BUILD
+++ b/3rdparty/python/BUILD
@@ -4,7 +4,7 @@
 # see/edit requirements.txt in this directory to change deps.
 python_requirements()
 
-# Only used by tests so we lift this library out of the requirements.txt
+# Only used by tests, so we lift this library out of the requirements.txt
 # file used to bootstrap pants itself.
 python_requirement_library(
   name='antlr-3.1.3',
@@ -17,6 +17,15 @@ python_requirement_library(
   ]
 )
 
+# Only used by a maintenance tool, so we lift this library out of the requirements.txt
+# file used to bootstrap pants itself.
+python_requirement_library(
+  name='s3logparse',
+  requirements=[
+    python_requirement('s3-log-parse==0.1.1')
+  ]
+)
+
 # NB: Needed only for tests: tests/python/pants_test/engine/legacy:graph.
 files(
   name='requirements_files',
diff --git a/src/python/pants/util/BUILD b/src/python/pants/util/BUILD
index 4577d85d5bd..fd69c0ee63f 100644
--- a/src/python/pants/util/BUILD
+++ b/src/python/pants/util/BUILD
@@ -117,6 +117,22 @@ python_library(
   ]
 )
 
+python_library(
+  name = 's3_log_aggregator',
+  sources = ['s3_log_aggregator.py'],
+  dependencies = [
+    '3rdparty/python:s3logparse'
+  ]
+)
+
+python_binary(
+  name = 's3_log_aggregator_bin',
+  entry_point = 'pants.util.s3_log_aggregator',
+  dependencies = [
+    ':s3_log_aggregator'
+  ]
+)
+
 python_library(
   name = 'socket',
   sources = ['socket.py']
diff --git a/src/python/pants/util/s3_log_aggregator.py b/src/python/pants/util/s3_log_aggregator.py
new file mode 100644
index 00000000000..477dc521d53
--- /dev/null
+++ b/src/python/pants/util/s3_log_aggregator.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2018 Pants project contributors (see CONTRIBUTORS.md).
+# Licensed under the Apache License, Version 2.0 (see LICENSE).
+
+from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
+                        unicode_literals, with_statement)
+
+from s3logparse.s3logparse import parse_log_lines
+
+import os
+import sys
+from collections import defaultdict
+
+
+class S3LogAccumulator(object):
+  """Aggregates total downloaded bytes per file from S3 logs.
+
+  Helps us track which binaries our S3 bandwidth costs are being spent on.
+
+  To run, first download S3 access logs. For example, to download all logs for 4/2018,
+  you can use something like:
+
+  aws s3 sync s3://logs.pantsbuild.org/binaries/ /tmp/s3logs --exclude "*" --include "2018-04-*"
+
+  Then run this binary on the downloaded logs:
+
+  ./pants run src/python/pants/util/:s3_log_aggregator_bin -- /tmp/s3logs
+  """
+
+  def __init__(self):
+    self._file_to_size = defaultdict(int)
+    self._file_to_count = defaultdict(int)
+
+  def accumulate(self, logdir):
+    for filename in os.listdir(logdir):
+      with open(os.path.join(logdir, filename)) as fp:
+        for log_entry in parse_log_lines(fp.readlines()):
+          self._file_to_size[log_entry.s3_key] += log_entry.bytes_sent
+          self._file_to_count[log_entry.s3_key] += 1
+
+  def get_by_size(self):
+    return sorted([(path, self._file_to_count[path], size)
+                   for (path, size) in self._file_to_size.items()],
+                  key=lambda x: x[2], reverse=True)
+
+  def get_by_size_prettyprinted(self):
+    return [(path, count, self.prettyprint_bytes(size))
+            for (path, count, size) in self.get_by_size()]
+
+  @staticmethod
+  def prettyprint_bytes(x):
+    for unit in ['B', 'KB', 'MB', 'GB']:
+      if abs(x) < 1024.0:
+        return '{:3.1f}{}'.format(x, unit)
+      x /= 1024.0
+    return '{:.1f}TB'.format(x)
+
+
+if __name__ == '__main__':
+  accumulator = S3LogAccumulator()
+  for logdir in sys.argv[1:]:
+    accumulator.accumulate(logdir)
+
+  for path, count, total_bytes in accumulator.get_by_size_prettyprinted():
+    print('{} {} {}'.format(total_bytes, count, path))

From 15a88797e764c9bc079b8132ca961120310e50ae Mon Sep 17 00:00:00 2001
From: Benjy Weinberger
Date: Fri, 4 May 2018 09:55:07 -0700
Subject: [PATCH 2/3] Emit more information, but just for the top N.

---
 src/python/pants/util/s3_log_aggregator.py | 75 ++++++++++++++++------
 1 file changed, 57 insertions(+), 18 deletions(-)

diff --git a/src/python/pants/util/s3_log_aggregator.py b/src/python/pants/util/s3_log_aggregator.py
index 477dc521d53..782e41e3ee8 100644
--- a/src/python/pants/util/s3_log_aggregator.py
+++ b/src/python/pants/util/s3_log_aggregator.py
@@ -5,12 +5,26 @@
 from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
                         unicode_literals, with_statement)
 
-from s3logparse.s3logparse import parse_log_lines
-
 import os
 import sys
 from collections import defaultdict
 
+from s3logparse.s3logparse import parse_log_lines
+
+
+class Measure(object):
+  def __init__(self, init_count=0, init_bytes=0):
+    self.count = init_count
+    self.bytes = init_bytes
+
+  def __add__(self, other):
+    return self.__class__(self.count + other.count, self.bytes + other.bytes)
+
+  def __iadd__(self, other):
+    self.count += other.count
+    self.bytes += other.bytes
+    return self
+
 
 class S3LogAccumulator(object):
   """Aggregates total downloaded bytes per file from S3 logs.
@@ -26,29 +40,55 @@ class S3LogAccumulator(object):
 
   ./pants run src/python/pants/util/:s3_log_aggregator_bin -- /tmp/s3logs
   """
 
-  def __init__(self):
-    self._file_to_size = defaultdict(int)
-    self._file_to_count = defaultdict(int)
+    self._path_to_measure = defaultdict(Measure)
+    self._ip_to_measure = defaultdict(Measure)
 
   def accumulate(self, logdir):
     for filename in os.listdir(logdir):
       with open(os.path.join(logdir, filename)) as fp:
         for log_entry in parse_log_lines(fp.readlines()):
-          self._file_to_size[log_entry.s3_key] += log_entry.bytes_sent
-          self._file_to_count[log_entry.s3_key] += 1
+          m = Measure(1, log_entry.bytes_sent)
+          self._path_to_measure[log_entry.s3_key] += m
+          self._ip_to_measure[log_entry.remote_ip] += m
 
-  def get_by_size(self):
-    return sorted([(path, self._file_to_count[path], size)
-                   for (path, size) in self._file_to_size.items()],
-                  key=lambda x: x[2], reverse=True)
+  def print_top_n(self, n=10):
+    def do_print(heading, data):
+      print()
+      print(heading)
+      print('=' * len(heading))
+      for key, measure in data[0:n]:
+        print('{} {} {}'.format(measure.count, self._prettyprint_bytes(measure.bytes), key))
+    do_print('Paths by count:', self.get_paths_sorted_by_count())
+    do_print('Paths by bytes:', self.get_paths_sorted_by_bytes())
+    do_print('IPs by count:', self.get_ips_sorted_by_count())
+    do_print('IPs by bytes:', self.get_ips_sorted_by_bytes())
+    print()
 
-  def get_by_size_prettyprinted(self):
-    return [(path, count, self.prettyprint_bytes(size))
-            for (path, count, size) in self.get_by_size()]
+  def get_paths_sorted_by_bytes(self):
+    return self._get_paths(sort_key=lambda m: m.bytes)
+
+  def get_paths_sorted_by_count(self):
+    return self._get_paths(sort_key=lambda m: m.count)
+
+  def get_ips_sorted_by_bytes(self):
+    return self._get_ips(sort_key=lambda m: m.bytes)
+
+  def get_ips_sorted_by_count(self):
+    return self._get_ips(sort_key=lambda m: m.count)
+
+  def _get_paths(self, sort_key):
+    return self._get(self._path_to_measure, sort_key)
+
+  def _get_ips(self, sort_key):
+    return self._get(self._ip_to_measure, sort_key)
 
   @staticmethod
-  def prettyprint_bytes(x):
+  def _get(measures_map, sort_key):
+    return sorted(measures_map.items(), key=lambda x: sort_key(x[1]), reverse=True)
+
+  @staticmethod
+  def _prettyprint_bytes(x):
     for unit in ['B', 'KB', 'MB', 'GB']:
       if abs(x) < 1024.0:
         return '{:3.1f}{}'.format(x, unit)
@@ -56,10 +96,9 @@ def prettyprint_bytes(x):
     return '{:.1f}TB'.format(x)
 
 
+
 if __name__ == '__main__':
   accumulator = S3LogAccumulator()
   for logdir in sys.argv[1:]:
     accumulator.accumulate(logdir)
-
-  for path, count, total_bytes in accumulator.get_by_size_prettyprinted():
-    print('{} {} {}'.format(total_bytes, count, path))
+  accumulator.print_top_n()

From 611f2549b2fe110ae08a7b79529bf59e4e4ad5c5 Mon Sep 17 00:00:00 2001
From: Benjy Weinberger
Date: Fri, 4 May 2018 14:54:36 -0700
Subject: [PATCH 3/3] Style nit.

---
 src/python/pants/util/s3_log_aggregator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/python/pants/util/s3_log_aggregator.py b/src/python/pants/util/s3_log_aggregator.py
index 782e41e3ee8..40ec294b749 100644
--- a/src/python/pants/util/s3_log_aggregator.py
+++ b/src/python/pants/util/s3_log_aggregator.py
@@ -40,6 +40,7 @@ class S3LogAccumulator(object):
 
   ./pants run src/python/pants/util/:s3_log_aggregator_bin -- /tmp/s3logs
   """
 
+  def __init__(self):
     self._path_to_measure = defaultdict(Measure)
     self._ip_to_measure = defaultdict(Measure)
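
A quick sanity check of the aggregation arithmetic, for reviewers who want to
poke at this without real S3 logs. This is a sketch, not part of the patches:
it assumes the final (post-"Style nit.") module is importable as
pants.util.s3_log_aggregator (the entry_point used by the s3_log_aggregator_bin
target), and the key and byte counts are made up. Each parsed log entry is
folded in as Measure(1, bytes_sent), so request counts and total bytes
accumulate together:

    from collections import defaultdict

    from pants.util.s3_log_aggregator import Measure, S3LogAccumulator

    # Two hypothetical downloads of the same key, 1536000 bytes each.
    path_to_measure = defaultdict(Measure)
    path_to_measure['bin/protoc/linux/x86_64/2.4.1/protoc'] += Measure(1, 1536000)
    path_to_measure['bin/protoc/linux/x86_64/2.4.1/protoc'] += Measure(1, 1536000)

    m = path_to_measure['bin/protoc/linux/x86_64/2.4.1/protoc']
    print(m.count)                                       # -> 2
    print(S3LogAccumulator._prettyprint_bytes(m.bytes))  # -> 2.9MB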
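
A similar sketch of the sorted views that print_top_n walks, again with
invented data. It reaches into the private _ip_to_measure map, which is fine
for a throwaway check but not something real callers should do:

    from pants.util.s3_log_aggregator import Measure, S3LogAccumulator

    acc = S3LogAccumulator()
    # One chatty IP (four small requests) vs. one heavy IP (a single 10MB download).
    acc._ip_to_measure['192.0.2.10'] += Measure(4, 1024)
    acc._ip_to_measure['192.0.2.20'] += Measure(1, 10485760)

    print([ip for ip, _ in acc.get_ips_sorted_by_count()])  # ['192.0.2.10', '192.0.2.20']
    print([ip for ip, _ in acc.get_ips_sorted_by_bytes()])  # ['192.0.2.20', '192.0.2.10']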