Skip to content

Commit

Permalink
Standardize datasets regarding how to load user-item feedback (#278)
Browse files Browse the repository at this point in the history
  • Loading branch information
saghiles authored Dec 27, 2019
1 parent 0f2a9e7 commit 08dcdad
Show file tree
Hide file tree
Showing 41 changed files with 92 additions and 121 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ Load the built-in [MovieLens 100K](https://grouplens.org/datasets/movielens/100k
```python
from cornac.datasets import movielens

ml_100k = movielens.load_100k()
ml_100k = movielens.load_feedback()
```

Split the data based on ratio:
Expand Down
8 changes: 4 additions & 4 deletions cornac/datasets/amazon_clothing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
from ..data.reader import read_text


def load_rating(reader: Reader = None) -> List:
"""Load the user-item ratings
def load_feedback(reader: Reader = None) -> List:
"""Load the user-item ratings, scale: [1,5]
Parameters
----------
Expand Down Expand Up @@ -80,8 +80,8 @@ def load_image():
return features, item_ids


def load_context(reader: Reader = None) -> List:
"""Load the item-item interactions
def load_graph(reader: Reader = None) -> List:
"""Load the item-item interactions (symmetric network), built from the Amazon Also-Viewed information
Parameters
----------
Expand Down
8 changes: 4 additions & 4 deletions cornac/datasets/amazon_office.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
from ..data import Reader


def load_rating(reader: Reader = None) -> List:
"""Load the user-item ratings
def load_feedback(reader: Reader = None) -> List:
"""Load the user-item ratings, scale: [1,5]
Parameters
----------
Expand All @@ -42,8 +42,8 @@ def load_rating(reader: Reader = None) -> List:
return reader.read(fpath, sep=' ')


def load_context(reader: Reader = None) -> List:
"""Load the item-item interactions
def load_graph(reader: Reader = None) -> List:
"""Load the item-item interactions (symmetric network), built from the Amazon Also-Viewed information
Parameters
----------
Expand Down
4 changes: 2 additions & 2 deletions cornac/datasets/amazon_toy.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
from typing import List


def load_rating(reader: Reader = None) -> List:
"""Load the user-item ratings
def load_feedback(reader: Reader = None) -> List:
"""Load the user-item ratings, scale: [1,5]
Parameters
----------
Expand Down
2 changes: 1 addition & 1 deletion cornac/datasets/citeulike.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from ..data import Reader


def load_data(reader: Reader = None) -> List:
def load_feedback(reader: Reader = None) -> List:
"""Load the implicit feedback between users and items
Parameters
Expand Down
6 changes: 3 additions & 3 deletions cornac/datasets/epinions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ def _get_cache_dir():
return cache_dir


def load_data(reader: Reader = None) -> List:
"""Load the rating feedback
def load_feedback(reader: Reader = None) -> List:
"""Load user-item ratings, rating value is in [1,5]
Parameters
----------
Expand All @@ -50,7 +50,7 @@ def load_data(reader: Reader = None) -> List:


def load_trust(reader: Reader = None) -> List:
"""Load the trust data
"""Load the user trust information (undirected network)
Parameters
----------
Expand Down
46 changes: 19 additions & 27 deletions cornac/datasets/movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,51 +22,43 @@
from ..data.reader import read_text

VALID_DATA_FORMATS = ['UIR', 'UIRT']
VARIANTS = ['100K', '1M']
UNZIP = {'100K':False, '1M':True}
SEP = {'100K':'\t', '1M':'::'}
URL = {'100K': 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data',
'1M': 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'}
RELATIVE_PATH = {'100K': 'ml-100k/u.data',
'1M': 'ml-1m/ratings.dat'}


def load_100k(fmt='UIR', reader=None):
"""Load the MovieLens 100K dataset
def load_feedback(fmt='UIR', variant='100K', reader=None):
"""Load the user-item ratings of one of the MovieLens datasets
Parameters
----------
fmt: str, default: 'UIR'
Data format to be returned.
Returns
-------
data: array-like
Data in the form of a list of tuples depending on the given data format.
"""
fmt = validate_format(fmt, VALID_DATA_FORMATS)
fpath = cache(url='http://files.grouplens.org/datasets/movielens/ml-100k/u.data',
relative_path='ml-100k/u.data')
reader = Reader() if reader is None else reader
return reader.read(fpath, fmt)


def load_1m(fmt='UIR', reader: Reader = None) -> List:
"""Load the MovieLens 1M dataset
variant: str, optional, default: '100K'
Specifies which MovieLens dataset to load, one of ['100K', '1M'].
Parameters
----------
fmt: str, default: 'UIR'
Data format to be returned.
reader: `obj:cornac.data.Reader`, default: None
reader: :obj:`cornac.data.Reader`, optional, default: None
Reader object used to read the data.
Returns
-------
data: array-like
Data in the form of a list of tuples depending on the given data format.
"""

fmt = validate_format(fmt, VALID_DATA_FORMATS)
fpath = cache(url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
unzip=True, relative_path='ml-1m/ratings.dat')

if variant not in VARIANTS:
raise ValueError('variant must be one of {}.'.format(VARIANTS))

fpath = cache(url=URL[variant], unzip=UNZIP[variant], relative_path=RELATIVE_PATH[variant])
reader = Reader() if reader is None else reader
return reader.read(fpath, fmt, sep='::')
return reader.read(fpath, fmt, sep=SEP[variant])


def load_plot():
Expand Down
38 changes: 10 additions & 28 deletions cornac/datasets/netflix.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from ..data import Reader

VALID_DATA_FORMATS = ['UIR', 'UIRT']
VARIANTS = ['original', 'small']
FNAME = {'small':'data_small', 'original':'data'}


def _load(fname, fmt='UIR', reader: Reader = None) -> List:
Expand Down Expand Up @@ -50,17 +52,17 @@ def _load(fname, fmt='UIR', reader: Reader = None) -> List:
return reader.read(fpath, fmt, sep=',')


def load_data(fmt='UIR', reader: Reader = None) -> List:
"""Load the Netflix entire dataset
- Number of ratings: 100,480,507
- Number of users: 480,189
- Number of items: 17,770
def load_feedback(fmt='UIR', variant='original', reader: Reader = None) -> List:
"""Load Netflix user-item ratings, scale: [1,5]
Parameters
----------
fmt: str, default: 'UIR'
Data format to be returned.
variant: str, optional, default: 'original'
Specifies which Netflix dataset to load, one of ['original', 'small'].
reader: :obj:`cornac.data.Reader`, default: None
Reader object used to read the data.
Expand All @@ -70,28 +72,8 @@ def load_data(fmt='UIR', reader: Reader = None) -> List:
Data in the form of a list of tuples depending on the given data format.
"""
return _load('data', fmt, reader)


def load_data_small(fmt='UIR', reader: Reader = None) -> List:
"""Load a small subset of the Netflix dataset. We draw this subsample such that
every user has at least 10 items and each item has at least 10 users.
- Number of ratings: 607,803
- Number of users: 10,000
- Number of items: 5,000
Parameters
----------
fmt: str, default: 'UIR'
Data format to be returned.

reader: `obj:cornac.data.Reader`, default: None
Reader object used to read the data.
Returns
-------
data: array-like
Data in the form of a list of tuples depending on the given data format.
if variant not in VARIANTS:
raise ValueError('variant must be one of {}.'.format(VARIANTS))

"""
return _load('data_small', fmt, reader)
return _load(FNAME[variant], fmt, reader)
4 changes: 2 additions & 2 deletions cornac/datasets/tradesy.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
from ..data.reader import read_text


def load_data(reader: Reader = None) -> List:
"""Load the feedback observations
def load_feedback(reader: Reader = None) -> List:
"""Load user-item feedback
Parameters
----------
Expand Down
2 changes: 1 addition & 1 deletion examples/biased_mf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit

ratio_split = RatioSplit(data=movielens.load_1m(),
ratio_split = RatioSplit(data=movielens.load_feedback(variant='1M'),
test_size=0.2,
exclude_unknowns=False,
verbose=True)
Expand Down
2 changes: 1 addition & 1 deletion examples/bpr_netflix.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from cornac.datasets import netflix
from cornac.eval_methods import RatioSplit

ratio_split = RatioSplit(data=netflix.load_data_small(reader=Reader(bin_threshold=1.0)),
ratio_split = RatioSplit(data=netflix.load_feedback(variant='small', reader=Reader(bin_threshold=1.0)),
test_size=0.1, rating_threshold=1.0,
exclude_unknowns=True, verbose=True)

Expand Down
4 changes: 2 additions & 2 deletions examples/c2pf_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
from cornac.datasets import amazon_office as office

# Load office ratings and item contexts, see C2PF paper for details
ratings = office.load_rating()
contexts = office.load_context()
ratings = office.load_feedback()
contexts = office.load_graph()

item_graph_modality = GraphModality(data=contexts)

Expand Down
2 changes: 1 addition & 1 deletion examples/cdl_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from cornac.data.text import BaseTokenizer

docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))
data = citeulike.load_feedback(reader=Reader(item_set=item_ids))

# build text modality
item_text_modality = TextModality(corpus=docs, ids=item_ids,
Expand Down
2 changes: 1 addition & 1 deletion examples/cdr_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from cornac.data.text import BaseTokenizer

docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))
data = citeulike.load_feedback(reader=Reader(item_set=item_ids))

# build text module
item_text_modality = TextModality(corpus=docs, ids=item_ids,
Expand Down
6 changes: 3 additions & 3 deletions examples/conv_mf_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@
from cornac.data.text import BaseTokenizer

plots, movie_ids = movielens.load_plot()
ml_1m = movielens.load_1m(reader=Reader(item_set=movie_ids))
ml_1m = movielens.load_feedback(variant='1M', reader=Reader(item_set=movie_ids))

# build text modality
item_text_modality = TextModality(corpus=plots, ids=movie_ids,
tokenizer=BaseTokenizer(sep='\t', stop_words='english'),
max_vocab=8000, max_doc_freq=0.5)
tokenizer=BaseTokenizer(sep='\t', stop_words='english'),
max_vocab=8000, max_doc_freq=0.5)

ratio_split = RatioSplit(data=ml_1m, test_size=0.2, exclude_unknowns=True,
item_text=item_text_modality, verbose=True, seed=123)
Expand Down
2 changes: 1 addition & 1 deletion examples/ctr_example_citeulike.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from cornac.data.text import BaseTokenizer

docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))
data = citeulike.load_feedback(reader=Reader(item_set=item_ids))

# build text modality
item_text_modality = TextModality(corpus=docs, ids=item_ids,
Expand Down
2 changes: 1 addition & 1 deletion examples/cvae_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from cornac.data.text import BaseTokenizer

docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))
data = citeulike.load_feedback(reader=Reader(item_set=item_ids))

# build text modality
item_text_modality = TextModality(corpus=docs, ids=item_ids,
Expand Down
2 changes: 1 addition & 1 deletion examples/efm_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from cornac.data import SentimentModality
from cornac.eval_methods import RatioSplit

rating = amazon_toy.load_rating()
rating = amazon_toy.load_feedback()
sentiment = amazon_toy.load_sentiment()
md = SentimentModality(data=sentiment)

Expand Down
4 changes: 2 additions & 2 deletions examples/first_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
import cornac as cn

# Load MovieLens 100K dataset
ml_100k = cn.datasets.movielens.load_100k()
ml_100k = cn.datasets.movielens.load_feedback()

# Split data based on ratio
ratio_split = cn.eval_methods.RatioSplit(data=ml_100k, test_size=0.2, rating_threshold=4.0, seed=123)

# Here we are comparing biased MF, PMF, and BPR
mf = cn.models.MF(k=10, max_iter=25, learning_rate=0.01, lambda_reg=0.02, use_bias=True, seed=123)
pmf = cn.models.PMF(k=10, max_iter=100, learning_rate=0.001, lamda=0.001, seed=123)
bpr = cn.models.BPR(k=10, max_iter=100, learning_rate=0.001, lambda_reg=0.01, seed=123)
bpr = cn.models.BPR(k=10, max_iter=200, learning_rate=0.001, lambda_reg=0.01, seed=123)

# Define metrics used to evaluate the models
mae = cn.metrics.MAE()
Expand Down
2 changes: 1 addition & 1 deletion examples/hft_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from cornac.data.text import BaseTokenizer

plots, movie_ids = movielens.load_plot()
ml_1m = movielens.load_1m(reader=Reader(item_set=movie_ids))
ml_1m = movielens.load_feedback(variant='1M', reader=Reader(item_set=movie_ids))

# build text module
item_text_modality = TextModality(corpus=plots, ids=movie_ids,
Expand Down
2 changes: 1 addition & 1 deletion examples/ibpr_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from cornac.models import IBPR

# Load the MovieLens 1M dataset
ml_1m = movielens.load_1m()
ml_1m = movielens.load_feedback(variant='1M')

# Instantiate an evaluation method.
ratio_split = RatioSplit(data=ml_1m, test_size=0.2, rating_threshold=1.0,
Expand Down
4 changes: 2 additions & 2 deletions examples/mcf_office.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
from cornac.datasets import amazon_office as office

# Load office ratings and item contexts, see C2PF paper for details
ratings = office.load_rating()
contexts = office.load_context()
ratings = office.load_feedback()
contexts = office.load_graph()

item_graph_modality = GraphModality(data=contexts)

Expand Down
2 changes: 1 addition & 1 deletion examples/mter_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from cornac.models import MTER
from cornac import Experiment

data = amazon_toy.load_rating()
data = amazon_toy.load_feedback()
sentiment = amazon_toy.load_sentiment()
md = SentimentModality(data=sentiment)
eval_method = RatioSplit(data, test_size=0.2, rating_threshold=1.0,
Expand Down
2 changes: 1 addition & 1 deletion examples/ncf_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from cornac.datasets import amazon_clothing
from cornac.data import Reader

ratio_split = RatioSplit(data=amazon_clothing.load_rating(reader=Reader(bin_threshold=1.0)),
ratio_split = RatioSplit(data=amazon_clothing.load_feedback(reader=Reader(bin_threshold=1.0)),
test_size=0.2, rating_threshold=1.0, seed=123,
exclude_unknowns=True, verbose=True)

Expand Down
Loading

0 comments on commit 08dcdad

Please sign in to comment.