diff --git a/setup.py b/setup.py index 4294d1ca..14ce82c8 100644 --- a/setup.py +++ b/setup.py @@ -20,15 +20,15 @@ def read(*paths): # Prepare PACKAGE = 'tabulator' INSTALL_REQUIRES = [ - 'six>=1.9,<2.0', - 'xlrd>=1.0,<2.0', - 'ijson>=2.0,<3.0', - 'chardet>=2.0,<3.0', - 'openpyxl>=2.0,<3.0', - 'requests>=2.8,<3.0', - 'beautifulsoup4>=4.4,<5.0', - 'linear-tsv>=0.99,<0.100', - 'unicodecsv>=0.14,<0.15', + 'six>=1.9,<2.0a', + 'xlrd>=1.0,<2.0a', + 'ijson>=2.0,<3.0a', + 'chardet>=2.0,<3.0a', + 'openpyxl>=2.0,<3.0a', + 'requests>=2.8,<3.0a', + 'beautifulsoup4>=4.4,<5.0a', + 'linear-tsv>=0.99,<0.100a', + 'unicodecsv>=0.14,<0.15a', ] TESTS_REQUIRE = [ 'pylama', diff --git a/tabulator/VERSION b/tabulator/VERSION index ee6cdce3..b6160487 100644 --- a/tabulator/VERSION +++ b/tabulator/VERSION @@ -1 +1 @@ -0.6.1 +0.6.2 diff --git a/tabulator/stream.py b/tabulator/stream.py index b48f823c..c99412c3 100644 --- a/tabulator/stream.py +++ b/tabulator/stream.py @@ -80,7 +80,7 @@ def __init__(self, loader_options=None, parser_options=None,): - # Init parameters + # Defaults if loader_options is None: loader_options = {} if parser_options is None: @@ -90,7 +90,7 @@ def __init__(self, if sample_size is None: sample_size = helpers.DEFAULT_SAMPLE_SIZE - # Set headers + # Headers self.__headers = None self.__headers_row = 0 if isinstance(headers, (tuple, list)): @@ -102,7 +102,7 @@ def __init__(self, msg = msg % (self.__headers_row, sample_size) raise exceptions.TabulatorException(msg) - # Set loader + # Loader if scheme is None: scheme = helpers.detect_scheme(source) or helpers.DEFAULT_SCHEME if scheme not in _LOADERS: @@ -110,7 +110,7 @@ def __init__(self, raise exceptions.LoadingError(message) self.__loader = _LOADERS[scheme](**loader_options) - # Set parser + # Parser if format is None: format = helpers.detect_format(source) if format not in _PARSERS: @@ -118,7 +118,7 @@ def __init__(self, raise exceptions.ParsingError(message) self.__parser = _PARSERS[format](**parser_options) - # Set attributes + # Attributes self.__source = source self.__encoding = encoding self.__post_parse = post_parse @@ -184,7 +184,9 @@ def sample(self): """list[]: sample of rows """ sample = [] - for number, headers, row in self.__sample_extended_rows: + iterator = iter(self.__sample_extended_rows) + iterator = self.__apply_processors(iterator) + for number, headers, row in iterator: sample.append(row) return sample @@ -199,16 +201,19 @@ def iter(self, keyed=False, extended=False): mixed[]/mixed{}: row/keyed row/extended row """ - extended_rows = self.__iter_exteneded_rows() - for processor in self.__post_parse: - extended_rows = processor(extended_rows) - for number, headers, row in extended_rows: - if extended: - yield (number, headers, row) - elif keyed: - yield dict(zip(headers, row)) - else: - yield row + iterator = chain( + self.__sample_extended_rows, + self.__parser.extended_rows) + iterator = self.__apply_processors(iterator) + for number, headers, row in iterator: + if number > self.__number: + self.__number = number + if extended: + yield (number, headers, row) + elif keyed: + yield dict(zip(headers, row)) + else: + yield row def read(self, keyed=False, extended=False, limit=None): """Return table rows with count limit. @@ -285,19 +290,17 @@ def __detect_html(self): msg = 'Source has been detected as HTML (not supported)' raise exceptions.TabulatorException(msg) - def __iter_exteneded_rows(self): - - # Prepare iterator - iterator = chain( - self.__sample_extended_rows, - self.__parser.extended_rows) + def __apply_processors(self, iterator): - # Iter extended rows - for number, headers, row in iterator: - if number > self.__number: - self.__number = number + # Apply processors to iterator + def builtin_processor(extended_rows): + for number, headers, row in extended_rows: headers = self.__headers yield (number, headers, row) + processors = [builtin_processor] + self.__post_parse + for processor in processors: + iterator = processor(iterator) + return iterator # Internal diff --git a/tests/test_topen.py b/tests/test_topen.py index 1811e043..21545c40 100644 --- a/tests/test_topen.py +++ b/tests/test_topen.py @@ -524,7 +524,21 @@ def cast_rows(extended_rows): # Make assertions assert table.headers == ['id', 'name'] - assert table.read() == [[2, '中国人']] + + +def test_processors_sample(): + + # Processors + def only_first_row(extended_rows): + for number, header, row in extended_rows: + if number == 1: + yield (number, header, row) + + # Get table + table = topen('data/table.csv', post_parse=[only_first_row]) + + # Make assertions + assert table.sample == [['id', 'name']] # Tests [save]