Skip to content

Commit

Permalink
update from master
Browse files Browse the repository at this point in the history
  • Loading branch information
vaibhavhrt committed Jul 17, 2019
2 parents 58a7da6 + 26bd34d commit b405d2d
Show file tree
Hide file tree
Showing 1,045 changed files with 143,771 additions and 110,906 deletions.
1 change: 1 addition & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
- [ ] closes #xxxx
- [ ] tests added / passed
- [ ] passes `black pandas`
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
- [ ] whatsnew entry
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# pre-commit hook configuration (https://pre-commit.com).
# Runs the same formatting/lint checks locally that CI enforces.
repos:
# black: opinionated code formatter (matches the `black pandas` CI check).
- repo: https://github.com/python/black
rev: stable
hooks:
- id: black
language_version: python3.7
# flake8: style/lint checks (matches the flake8 diff check in CI).
- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.7
hooks:
- id: flake8
language: python_venv
# isort: keeps import ordering consistent across the codebase.
- repo: https://github.com/pre-commit/mirrors-isort
rev: v4.3.20
hooks:
- id: isort
language: python_venv
8 changes: 0 additions & 8 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,10 @@ matrix:
env:
- JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"

# In allow_failures
- dist: trusty
env:
- JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
allow_failures:
- dist: trusty
env:
- JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
- dist: trusty
env:
- JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true

before_install:
- echo "before_install"
Expand Down Expand Up @@ -97,7 +90,6 @@ before_script:
script:
- echo "script start"
- source activate pandas-dev
- ci/build_docs.sh
- ci/run_tests.sh

after_script:
Expand Down
2 changes: 2 additions & 0 deletions LICENSES/HAVEN_LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2013-2016
COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller
32 changes: 32 additions & 0 deletions LICENSES/HAVEN_MIT
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Based on http://opensource.org/licenses/MIT

This is a template. Complete and ship as file LICENSE the following 2
lines (only)

YEAR:
COPYRIGHT HOLDER:

and specify as

License: MIT + file LICENSE

Copyright (c) <YEAR>, <COPYRIGHT HOLDER>

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
tseries: pandas/_libs/lib.pyx pandas/_libs/tslib.pyx pandas/_libs/hashtable.pyx
python setup.py build_ext --inplace

.PHONY : develop build clean clean_pyc tseries doc
.PHONY : develop build clean clean_pyc doc lint-diff black

clean:
-python setup.py clean
Expand All @@ -15,8 +12,11 @@ build: clean_pyc
lint-diff:
git diff upstream/master --name-only -- "*.py" | xargs flake8

black:
black . --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist)'

develop: build
-python setup.py develop
python setup.py develop

doc:
-rm -rf doc/build doc/source/generated
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ Most development discussion is taking place on github in this repo. Further, the

All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.

A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas-docs.github.io/pandas-docs-travis/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
A detailed overview on how to contribute can be found in the **[contributing guide](https://dev.pandas.io/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.

If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.

Expand Down
146 changes: 94 additions & 52 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,45 @@

import numpy as np

from pandas._libs import lib

import pandas as pd
from pandas.util import testing as tm

for imp in ['pandas.util', 'pandas.tools.hashing']:
for imp in ["pandas.util", "pandas.tools.hashing"]:
try:
hashing = import_module(imp)
break
except (ImportError, TypeError, ValueError):
pass


class MaybeConvertObjects:
    """Benchmark ``lib.maybe_convert_objects`` on a large object-dtype array.

    The array is almost entirely integers with a single ``pd.NaT`` in front,
    which forces the conversion routine through its mixed-type handling.
    """

    def setup(self):
        size = 10 ** 5
        values = list(range(size))
        # Seed one NaT so the array cannot be trivially treated as all-int.
        values[0] = pd.NaT
        self.data = np.array(values)

    def time_maybe_convert_objects(self):
        lib.maybe_convert_objects(self.data)


class Factorize:

params = [[True, False], ['int', 'uint', 'float', 'string']]
param_names = ['sort', 'dtype']
params = [[True, False], ["int", "uint", "float", "string"]]
param_names = ["sort", "dtype"]

def setup(self, sort, dtype):
N = 10**5
data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
'uint': pd.UInt64Index(np.arange(N).repeat(5)),
'float': pd.Float64Index(np.random.randn(N).repeat(5)),
'string': tm.makeStringIndex(N).repeat(5)}
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N).repeat(5)),
"uint": pd.UInt64Index(np.arange(N).repeat(5)),
"float": pd.Float64Index(np.random.randn(N).repeat(5)),
"string": tm.makeStringIndex(N).repeat(5),
}
self.idx = data[dtype]

def time_factorize(self, sort, dtype):
Expand All @@ -32,15 +49,17 @@ def time_factorize(self, sort, dtype):

class FactorizeUnique:

params = [[True, False], ['int', 'uint', 'float', 'string']]
param_names = ['sort', 'dtype']
params = [[True, False], ["int", "uint", "float", "string"]]
param_names = ["sort", "dtype"]

def setup(self, sort, dtype):
N = 10**5
data = {'int': pd.Int64Index(np.arange(N)),
'uint': pd.UInt64Index(np.arange(N)),
'float': pd.Float64Index(np.arange(N)),
'string': tm.makeStringIndex(N)}
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N)),
"uint": pd.UInt64Index(np.arange(N)),
"float": pd.Float64Index(np.arange(N)),
"string": tm.makeStringIndex(N),
}
self.idx = data[dtype]
assert self.idx.is_unique

Expand All @@ -50,15 +69,17 @@ def time_factorize(self, sort, dtype):

class Duplicated:

params = [['first', 'last', False], ['int', 'uint', 'float', 'string']]
param_names = ['keep', 'dtype']
params = [["first", "last", False], ["int", "uint", "float", "string"]]
param_names = ["keep", "dtype"]

def setup(self, keep, dtype):
N = 10**5
data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
'uint': pd.UInt64Index(np.arange(N).repeat(5)),
'float': pd.Float64Index(np.random.randn(N).repeat(5)),
'string': tm.makeStringIndex(N).repeat(5)}
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N).repeat(5)),
"uint": pd.UInt64Index(np.arange(N).repeat(5)),
"float": pd.Float64Index(np.random.randn(N).repeat(5)),
"string": tm.makeStringIndex(N).repeat(5),
}
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique
Expand All @@ -69,15 +90,17 @@ def time_duplicated(self, keep, dtype):

class DuplicatedUniqueIndex:

params = ['int', 'uint', 'float', 'string']
param_names = ['dtype']
params = ["int", "uint", "float", "string"]
param_names = ["dtype"]

def setup(self, dtype):
N = 10**5
data = {'int': pd.Int64Index(np.arange(N)),
'uint': pd.UInt64Index(np.arange(N)),
'float': pd.Float64Index(np.random.randn(N)),
'string': tm.makeStringIndex(N)}
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N)),
"uint": pd.UInt64Index(np.arange(N)),
"float": pd.Float64Index(np.random.randn(N)),
"string": tm.makeStringIndex(N),
}
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique
Expand All @@ -87,58 +110,77 @@ def time_duplicated_unique(self, dtype):


class Hashing:

def setup_cache(self):
N = 10**5
N = 10 ** 5

df = pd.DataFrame(
{'strings': pd.Series(tm.makeStringIndex(10000).take(
np.random.randint(0, 10000, size=N))),
'floats': np.random.randn(N),
'ints': np.arange(N),
'dates': pd.date_range('20110101', freq='s', periods=N),
'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
df['categories'] = df['strings'].astype('category')
{
"strings": pd.Series(
tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
),
"floats": np.random.randn(N),
"ints": np.arange(N),
"dates": pd.date_range("20110101", freq="s", periods=N),
"timedeltas": pd.timedelta_range("1 day", freq="s", periods=N),
}
)
df["categories"] = df["strings"].astype("category")
df.iloc[10:20] = np.nan
return df

def time_frame(self, df):
hashing.hash_pandas_object(df)

def time_series_int(self, df):
hashing.hash_pandas_object(df['ints'])
hashing.hash_pandas_object(df["ints"])

def time_series_string(self, df):
hashing.hash_pandas_object(df['strings'])
hashing.hash_pandas_object(df["strings"])

def time_series_float(self, df):
hashing.hash_pandas_object(df['floats'])
hashing.hash_pandas_object(df["floats"])

def time_series_categorical(self, df):
hashing.hash_pandas_object(df['categories'])
hashing.hash_pandas_object(df["categories"])

def time_series_timedeltas(self, df):
hashing.hash_pandas_object(df['timedeltas'])
hashing.hash_pandas_object(df["timedeltas"])

def time_series_dates(self, df):
hashing.hash_pandas_object(df['dates'])
hashing.hash_pandas_object(df["dates"])


class Quantile:
params = [[0, 0.5, 1],
['linear', 'nearest', 'lower', 'higher', 'midpoint'],
['float', 'int', 'uint']]
param_names = ['quantile', 'interpolation', 'dtype']
params = [
[0, 0.5, 1],
["linear", "nearest", "lower", "higher", "midpoint"],
["float", "int", "uint"],
]
param_names = ["quantile", "interpolation", "dtype"]

def setup(self, quantile, interpolation, dtype):
N = 10**5
data = {'int': np.arange(N),
'uint': np.arange(N).astype(np.uint64),
'float': np.random.randn(N)}
N = 10 ** 5
data = {
"int": np.arange(N),
"uint": np.arange(N).astype(np.uint64),
"float": np.random.randn(N),
}
self.idx = pd.Series(data[dtype].repeat(5))

def time_quantile(self, quantile, interpolation, dtype):
self.idx.quantile(quantile, interpolation=interpolation)


class SortIntegerArray:
    """Benchmark ``argsort`` on the nullable ``Int64`` extension array."""

    # Two sizes to expose scaling behaviour of the sort.
    params = [10 ** 3, 10 ** 5]

    def setup(self, N):
        values = np.arange(N, dtype=float)
        # A single NaN exercises the missing-value path of the sort.
        values[40] = np.nan
        self.array = pd.array(values, dtype="Int64")

    def time_argsort(self, N):
        self.array.argsort()


from .pandas_vb_common import setup # noqa: F401 isort:skip
6 changes: 2 additions & 4 deletions asv_bench/benchmarks/attrs_caching.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import numpy as np
from pandas import DataFrame

try:
from pandas.util import cache_readonly
except ImportError:
from pandas.util.decorators import cache_readonly


class DataFrameAttributes:

def setup(self):
self.df = DataFrame(np.random.randn(10, 6))
self.cur_index = self.df.index
Expand All @@ -20,14 +20,12 @@ def time_set_index(self):


class CacheReadonly:

def setup(self):

class Foo:

@cache_readonly
def prop(self):
return 5

self.obj = Foo()

def time_cache_readonly(self):
Expand Down
Loading

0 comments on commit b405d2d

Please sign in to comment.