Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make use of ~/rl_data for storing large files #92

Merged
merged 8 commits into from
Jul 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion recordlinkage/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from recordlinkage.datasets.external import *
from recordlinkage.datasets.external import clear_data_home
from recordlinkage.datasets.external import get_data_home
from recordlinkage.datasets.external import load_krebsregister
from recordlinkage.datasets.febrl import *
from recordlinkage.datasets.generate import *
58 changes: 54 additions & 4 deletions recordlinkage/datasets/external.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,59 @@


# The function get_data_home() and clear_data_home() are based on
# SciKit-Learn https://git.io/fjT70. See the 3-clause BSD license.

from io import BytesIO
from os import environ
import shutil
from pathlib import Path
from urllib.request import urlopen
import zipfile

import pandas


def get_data_home(data_home=None):
    """Return the path of the Record Linkage data folder.

    This folder is used by some large dataset loaders to avoid
    downloading the data several times. By default the data dir
    is set to a folder named 'rl_data' in the user
    home folder.
    Alternatively, it can be set by the 'RL_DATA' environment
    variable or programmatically by giving an explicit folder
    path. The '~' symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically
    created.

    Parameters
    ----------
    data_home : str | None
        The path to recordlinkage data folder.

    Returns
    -------
    pathlib.Path
        The (possibly newly created) path of the data folder.
    """
    if data_home is None:
        # An explicit argument wins; otherwise fall back to the
        # RL_DATA environment variable, then to '~/rl_data'.
        data_home = environ.get('RL_DATA',
                                Path('~', 'rl_data'))
    data_home = Path(data_home).expanduser()

    # mkdir with exist_ok=True is already a no-op when the folder
    # exists, so a separate exists() check is redundant and would
    # introduce a TOCTOU race between the check and the creation.
    data_home.mkdir(parents=True, exist_ok=True)

    return data_home


def clear_data_home(data_home=None):
    """Remove the data home folder and everything inside it.

    The folder is resolved through :func:`get_data_home`, so the
    same defaulting rules apply (explicit argument, then the
    'RL_DATA' environment variable, then '~/rl_data').

    Parameters
    ----------
    data_home : str | None
        The path to recordlinkage data folder.
    """
    folder_to_remove = get_data_home(data_home)
    shutil.rmtree(str(folder_to_remove))


def load_krebsregister(block=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
missing_values=None, shuffle=True):
Expand Down Expand Up @@ -69,7 +117,7 @@ def load_krebsregister(block=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
for i in range(1, 11):

filepath = Path(
Path(__file__).parent,
get_data_home(),
'krebsregister',
'block_{}.zip'.format(i)
)
Expand Down Expand Up @@ -102,13 +150,15 @@ def _download_krebsregister():
zip_file_url = "http://archive.ics.uci.edu/ml/" \
"machine-learning-databases/00210/donation.zip"

folder = Path(get_data_home(), 'krebsregister')

try:
print("Start downloading the data.")
print("Downloading data to {}.".format(folder))
r = urlopen(zip_file_url).read()

# unzip the content and put it in the krebsregister folder
z = zipfile.ZipFile(BytesIO(r))
z.extractall(str(Path(Path(__file__).parent, 'krebsregister')))
z.extractall(str(folder))

print("Data download succesfull.")

Expand All @@ -124,7 +174,7 @@ def _krebsregister_block(block):
"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] or list of integers.")

fp_i = Path(
Path(__file__).parent,
get_data_home(),
'krebsregister',
'block_{}.zip'.format(block)
)
Expand Down
174 changes: 100 additions & 74 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from os import environ
from pathlib import Path

import numpy

import pandas
Expand All @@ -10,6 +13,7 @@
from recordlinkage.datasets import (load_febrl1, load_febrl2, load_febrl3,
load_febrl4, load_krebsregister,
binary_vectors)
from recordlinkage.datasets import get_data_home, clear_data_home


FEBRL_DEDUP = [
Expand All @@ -22,105 +26,127 @@
]


class TestExternalDatasets(object):
@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup(dataset, nrows, nlinks):

df = dataset()
assert isinstance(df, pandas.DataFrame)
assert len(df) == nrows

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup_links(dataset, nrows, nlinks):

df, links = dataset(return_links=True)
assert isinstance(df, pandas.DataFrame)
assert len(df) == nrows
assert len(links) == nlinks
assert isinstance(links, pandas.MultiIndex)

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup_tril(dataset, nrows, nlinks):

df, links = dataset(return_links=True)

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup(self, dataset, nrows, nlinks):
s_level_1 = pandas.Series(numpy.arange(len(df)), index=df.index)
s_level_2 = pandas.Series(numpy.arange(len(df)), index=df.index)

df = dataset()
assert isinstance(df, pandas.DataFrame)
assert len(df) == nrows
x1 = s_level_1.loc[links.get_level_values(0)]
x2 = s_level_2.loc[links.get_level_values(1)]

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup_links(self, dataset, nrows, nlinks):
assert numpy.all(x1.values > x2.values)

df, links = dataset(return_links=True)
assert isinstance(df, pandas.DataFrame)
assert len(df) == nrows
assert len(links) == nlinks
assert isinstance(links, pandas.MultiIndex)
def test_febrl4():

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup_tril(self, dataset, nrows, nlinks):
dfa, dfb = load_febrl4()
assert isinstance(dfa, pandas.DataFrame)
assert isinstance(dfb, pandas.DataFrame)
assert len(dfa) == 5000
assert len(dfb) == 5000

df, links = dataset(return_links=True)
def test_febrl_links():
dfa, dfb, links = load_febrl4(return_links=True)
assert isinstance(dfa, pandas.DataFrame)
assert isinstance(dfb, pandas.DataFrame)
assert len(dfa) == 5000
assert len(dfb) == 5000
assert isinstance(links, pandas.MultiIndex)

s_level_1 = pandas.Series(numpy.arange(len(df)), index=df.index)
s_level_2 = pandas.Series(numpy.arange(len(df)), index=df.index)
def test_krebs_dataset_download():

x1 = s_level_1.loc[links.get_level_values(0)]
x2 = s_level_2.loc[links.get_level_values(1)]
# remove downloaded datasets
clear_data_home()

assert numpy.all(x1.values > x2.values)
krebs_data, krebs_matches = load_krebsregister()

def test_febrl4(self):
for i in range(1, 11):
assert Path(
get_data_home(),
"krebsregister",
"block_{}.zip".format(i)
).is_file()

dfa, dfb = load_febrl4()
assert isinstance(dfa, pandas.DataFrame)
assert isinstance(dfb, pandas.DataFrame)
assert len(dfa) == 5000
assert len(dfb) == 5000
# count the number of records
assert type(krebs_data), pandas.DataFrame
assert type(krebs_matches), pandas.MultiIndex
assert len(krebs_data) == 5749132
assert len(krebs_matches) == 20931

def test_febrl_links(self):
dfa, dfb, links = load_febrl4(return_links=True)
assert isinstance(dfa, pandas.DataFrame)
assert isinstance(dfb, pandas.DataFrame)
assert len(dfa) == 5000
assert len(dfb) == 5000
assert isinstance(links, pandas.MultiIndex)
def test_krebs_dataset_environ(tmpdir):

def test_krebs_dataset(self):
path = Path(str(tmpdir)).expanduser()
environ['RL_DATA'] = str(path)

krebs_data, krebs_matches = load_krebsregister()
krebs_data_block1, krebs_matches_block1 = load_krebsregister(1)
krebs_data_block10, krebs_matches_block10 = load_krebsregister(10)
krebs_data, krebs_matches = load_krebsregister()

# count the number of records
assert type(krebs_data), pandas.DataFrame
assert type(krebs_matches), pandas.MultiIndex
assert len(krebs_data) == 5749132
assert len(krebs_matches) == 20931
for i in range(1, 11):
assert Path(
path,
"krebsregister",
"block_{}.zip".format(i)
).is_file()

assert len(krebs_data_block1) > 0
assert len(krebs_data_block10) > 0
def test_krebs_dataset():
krebs_data_block1, krebs_matches_block1 = load_krebsregister(1)
krebs_data_block10, krebs_matches_block10 = load_krebsregister(10)

# load not existing block
with pytest.raises(ValueError):
load_krebsregister(11)
assert len(krebs_data_block1) > 0
assert len(krebs_data_block10) > 0

# missing values
krebs_block10, matches = load_krebsregister(10, missing_values=0)
assert krebs_block10.isnull().sum().sum() == 0
# load not existing block
with pytest.raises(ValueError):
load_krebsregister(11)

def test_krebs_missings(self):
# missing values
krebs_block10, matches = load_krebsregister(10, missing_values=0)
assert krebs_block10.isnull().sum().sum() == 0

# missing values
krebs_block10, matches = load_krebsregister(10, missing_values=0)
assert krebs_block10.isnull().sum().sum() == 0
def test_krebs_missings():

def test_krebs_shuffle(self):
# missing values
krebs_block10, matches = load_krebsregister(10, missing_values=0)
assert krebs_block10.isnull().sum().sum() == 0

# missing values
krebs_block10, matches = load_krebsregister(10, shuffle=False)
def test_krebs_shuffle():

# missing values
krebs_block10, matches = load_krebsregister(10, shuffle=False)

class TestGeneratedDatasets(object):
def test_random_comparison_vectors(self):
# Test the generation of a random dataset
def test_random_comparison_vectors():
# Test the generation of a random dataset

n_record_pairs = 10000
n_matches = 500
n_record_pairs = 10000
n_matches = 500

df = binary_vectors(
n_record_pairs,
n_matches,
m=[0.8] * 8,
u=[0.2] * 8,
random_state=535)
df = binary_vectors(
n_record_pairs,
n_matches,
m=[0.8] * 8,
u=[0.2] * 8,
random_state=535)

# Check the result is a DataFrame with MultiIndex
assert isinstance(df, pandas.DataFrame)
assert isinstance(df.index, pandas.MultiIndex)
# Check the result is a DataFrame with MultiIndex
assert isinstance(df, pandas.DataFrame)
assert isinstance(df.index, pandas.MultiIndex)

# Test the length of the dataframe
assert len(df) == n_record_pairs
# Test the length of the dataframe
assert len(df) == n_record_pairs