Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make use of ~/rl_data for storing large files #92

Merged
merged 8 commits into from
Jul 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion recordlinkage/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from recordlinkage.datasets.external import *
from recordlinkage.datasets.external import clear_data_home
from recordlinkage.datasets.external import get_data_home
from recordlinkage.datasets.external import load_krebsregister
from recordlinkage.datasets.febrl import *
from recordlinkage.datasets.generate import *
58 changes: 54 additions & 4 deletions recordlinkage/datasets/external.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,59 @@


# The function get_data_home() and clear_data_home() are based on
# SciKit-Learn https://git.io/fjT70. See the 3-clause BSD license.

from io import BytesIO
from os import environ
import shutil
from pathlib import Path
from urllib.request import urlopen
import zipfile

import pandas


def get_data_home(data_home=None):
    """Return the path of the Record Linkage data folder.

    This folder is used by some large dataset loaders to avoid
    downloading the data several times. By default the data dir
    is set to a folder named 'rl_data' in the user
    home folder.
    Alternatively, it can be set by the 'RL_DATA' environment
    variable or programmatically by giving an explicit folder
    path. The '~' symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically
    created.

    Parameters
    ----------
    data_home : str | None
        The path to recordlinkage data folder.

    Returns
    -------
    pathlib.Path
        The (possibly newly created) path of the data folder.
    """
    if data_home is None:
        # An explicit argument wins; otherwise fall back to the
        # RL_DATA environment variable, then to '~/rl_data'.
        data_home = environ.get('RL_DATA',
                                Path('~', 'rl_data'))
    data_home = Path(data_home).expanduser()

    # mkdir with exist_ok=True is already a no-op when the folder
    # exists, so a separate exists() check is redundant and would
    # introduce a TOCTOU race between the check and the creation.
    data_home.mkdir(parents=True, exist_ok=True)

    return data_home


def clear_data_home(data_home=None):
    """Remove the data home folder and everything inside it.

    The folder is resolved through :func:`get_data_home`, so the
    same defaulting rules apply (explicit argument, then the
    'RL_DATA' environment variable, then '~/rl_data').

    Parameters
    ----------
    data_home : str | None
        The path to recordlinkage data folder.
    """
    folder_to_remove = get_data_home(data_home)
    shutil.rmtree(str(folder_to_remove))


def load_krebsregister(block=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
missing_values=None, shuffle=True):
Expand Down Expand Up @@ -69,7 +117,7 @@ def load_krebsregister(block=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
for i in range(1, 11):

filepath = Path(
Path(__file__).parent,
get_data_home(),
'krebsregister',
'block_{}.zip'.format(i)
)
Expand Down Expand Up @@ -102,13 +150,15 @@ def _download_krebsregister():
zip_file_url = "http://archive.ics.uci.edu/ml/" \
"machine-learning-databases/00210/donation.zip"

folder = Path(get_data_home(), 'krebsregister')

try:
print("Start downloading the data.")
print("Downloading data to {}.".format(folder))
r = urlopen(zip_file_url).read()

# unzip the content and put it in the krebsregister folder
z = zipfile.ZipFile(BytesIO(r))
z.extractall(str(Path(Path(__file__).parent, 'krebsregister')))
z.extractall(str(folder))

print("Data download succesfull.")

Expand All @@ -124,7 +174,7 @@ def _krebsregister_block(block):
"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] or list of integers.")

fp_i = Path(
Path(__file__).parent,
get_data_home(),
'krebsregister',
'block_{}.zip'.format(block)
)
Expand Down
174 changes: 100 additions & 74 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from os import environ
from pathlib import Path

import numpy

import pandas
Expand All @@ -10,6 +13,7 @@
from recordlinkage.datasets import (load_febrl1, load_febrl2, load_febrl3,
load_febrl4, load_krebsregister,
binary_vectors)
from recordlinkage.datasets import get_data_home, clear_data_home


FEBRL_DEDUP = [
Expand All @@ -22,105 +26,127 @@
]


class TestExternalDatasets(object):
@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup(dataset, nrows, nlinks):

df = dataset()
assert isinstance(df, pandas.DataFrame)
assert len(df) == nrows

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup_links(dataset, nrows, nlinks):

df, links = dataset(return_links=True)
assert isinstance(df, pandas.DataFrame)
assert len(df) == nrows
assert len(links) == nlinks
assert isinstance(links, pandas.MultiIndex)

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup_tril(dataset, nrows, nlinks):

df, links = dataset(return_links=True)

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup(self, dataset, nrows, nlinks):
s_level_1 = pandas.Series(numpy.arange(len(df)), index=df.index)
s_level_2 = pandas.Series(numpy.arange(len(df)), index=df.index)

df = dataset()
assert isinstance(df, pandas.DataFrame)
assert len(df) == nrows
x1 = s_level_1.loc[links.get_level_values(0)]
x2 = s_level_2.loc[links.get_level_values(1)]

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup_links(self, dataset, nrows, nlinks):
assert numpy.all(x1.values > x2.values)

df, links = dataset(return_links=True)
assert isinstance(df, pandas.DataFrame)
assert len(df) == nrows
assert len(links) == nlinks
assert isinstance(links, pandas.MultiIndex)
def test_febrl4():

@pytest.mark.parametrize("dataset,nrows,nlinks", FEBRL_DEDUP)
def test_febrl_dedup_tril(self, dataset, nrows, nlinks):
dfa, dfb = load_febrl4()
assert isinstance(dfa, pandas.DataFrame)
assert isinstance(dfb, pandas.DataFrame)
assert len(dfa) == 5000
assert len(dfb) == 5000

df, links = dataset(return_links=True)
def test_febrl_links():
dfa, dfb, links = load_febrl4(return_links=True)
assert isinstance(dfa, pandas.DataFrame)
assert isinstance(dfb, pandas.DataFrame)
assert len(dfa) == 5000
assert len(dfb) == 5000
assert isinstance(links, pandas.MultiIndex)

s_level_1 = pandas.Series(numpy.arange(len(df)), index=df.index)
s_level_2 = pandas.Series(numpy.arange(len(df)), index=df.index)
def test_krebs_dataset_download():

x1 = s_level_1.loc[links.get_level_values(0)]
x2 = s_level_2.loc[links.get_level_values(1)]
# remove downloaded datasets
clear_data_home()

assert numpy.all(x1.values > x2.values)
krebs_data, krebs_matches = load_krebsregister()

def test_febrl4(self):
for i in range(1, 11):
assert Path(
get_data_home(),
"krebsregister",
"block_{}.zip".format(i)
).is_file()

dfa, dfb = load_febrl4()
assert isinstance(dfa, pandas.DataFrame)
assert isinstance(dfb, pandas.DataFrame)
assert len(dfa) == 5000
assert len(dfb) == 5000
# count the number of records
assert type(krebs_data), pandas.DataFrame
assert type(krebs_matches), pandas.MultiIndex
assert len(krebs_data) == 5749132
assert len(krebs_matches) == 20931

def test_febrl_links(self):
dfa, dfb, links = load_febrl4(return_links=True)
assert isinstance(dfa, pandas.DataFrame)
assert isinstance(dfb, pandas.DataFrame)
assert len(dfa) == 5000
assert len(dfb) == 5000
assert isinstance(links, pandas.MultiIndex)
def test_krebs_dataset_environ(tmpdir):

def test_krebs_dataset(self):
path = Path(str(tmpdir)).expanduser()
environ['RL_DATA'] = str(path)

krebs_data, krebs_matches = load_krebsregister()
krebs_data_block1, krebs_matches_block1 = load_krebsregister(1)
krebs_data_block10, krebs_matches_block10 = load_krebsregister(10)
krebs_data, krebs_matches = load_krebsregister()

# count the number of records
assert type(krebs_data), pandas.DataFrame
assert type(krebs_matches), pandas.MultiIndex
assert len(krebs_data) == 5749132
assert len(krebs_matches) == 20931
for i in range(1, 11):
assert Path(
path,
"krebsregister",
"block_{}.zip".format(i)
).is_file()

assert len(krebs_data_block1) > 0
assert len(krebs_data_block10) > 0
def test_krebs_dataset():
krebs_data_block1, krebs_matches_block1 = load_krebsregister(1)
krebs_data_block10, krebs_matches_block10 = load_krebsregister(10)

# load not existing block
with pytest.raises(ValueError):
load_krebsregister(11)
assert len(krebs_data_block1) > 0
assert len(krebs_data_block10) > 0

# missing values
krebs_block10, matches = load_krebsregister(10, missing_values=0)
assert krebs_block10.isnull().sum().sum() == 0
# load not existing block
with pytest.raises(ValueError):
load_krebsregister(11)

def test_krebs_missings(self):
# missing values
krebs_block10, matches = load_krebsregister(10, missing_values=0)
assert krebs_block10.isnull().sum().sum() == 0

# missing values
krebs_block10, matches = load_krebsregister(10, missing_values=0)
assert krebs_block10.isnull().sum().sum() == 0
def test_krebs_missings():

def test_krebs_shuffle(self):
# missing values
krebs_block10, matches = load_krebsregister(10, missing_values=0)
assert krebs_block10.isnull().sum().sum() == 0

# missing values
krebs_block10, matches = load_krebsregister(10, shuffle=False)
def test_krebs_shuffle():

# missing values
krebs_block10, matches = load_krebsregister(10, shuffle=False)

class TestGeneratedDatasets(object):
def test_random_comparison_vectors(self):
# Test the generation of a random dataset
def test_random_comparison_vectors():
# Test the generation of a random dataset

n_record_pairs = 10000
n_matches = 500
n_record_pairs = 10000
n_matches = 500

df = binary_vectors(
n_record_pairs,
n_matches,
m=[0.8] * 8,
u=[0.2] * 8,
random_state=535)
df = binary_vectors(
n_record_pairs,
n_matches,
m=[0.8] * 8,
u=[0.2] * 8,
random_state=535)

# Check the result is a DataFrame with MultiIndex
assert isinstance(df, pandas.DataFrame)
assert isinstance(df.index, pandas.MultiIndex)
# Check the result is a DataFrame with MultiIndex
assert isinstance(df, pandas.DataFrame)
assert isinstance(df.index, pandas.MultiIndex)

# Test the length of the dataframe
assert len(df) == n_record_pairs
# Test the length of the dataframe
assert len(df) == n_record_pairs