Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add convert to recordio function #2608

Merged
merged 8 commits into from
Jun 27, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 23 additions & 8 deletions python/paddle/v2/dataset/cifar.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@
import cPickle
import itertools
import numpy
from common import download
import paddle.v2.dataset.common
import tarfile

__all__ = ['train100', 'test100', 'train10', 'test10']
__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']

URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
Expand Down Expand Up @@ -75,7 +75,8 @@ def train100():
:rtype: callable
"""
return reader_creator(
download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
'train')


def test100():
Expand All @@ -88,7 +89,9 @@ def test100():
:return: Test reader creator.
:rtype: callable
"""
return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
return reader_creator(
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
'test')


def train10():
Expand All @@ -102,7 +105,8 @@ def train10():
:rtype: callable
"""
return reader_creator(
download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
'data_batch')


def test10():
Expand All @@ -116,9 +120,20 @@ def test10():
:rtype: callable
"""
return reader_creator(
download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch')
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
'test_batch')


def fetch():
download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)


def convert(path):
    """
    Convert the CIFAR-100 and CIFAR-10 train/test readers to recordio format.

    :param path: forwarded unchanged as the first argument of
        paddle.v2.dataset.common.convert for every split.
    """
    # (reader creator, output name) pairs, in the order they are written.
    splits = (
        (train100, "cifar_train100"),
        (test100, "cifar_test100"),
        (train10, "cifar_train10"),
        (test10, "cifar_test10"),
    )
    # Each reader is created immediately before its own conversion.
    # The literal 10 is presumably the shard count -- confirm against
    # paddle.v2.dataset.common.convert.
    for make_reader, name in splits:
        paddle.v2.dataset.common.convert(path, make_reader(), 10, name)
5 changes: 4 additions & 1 deletion python/paddle/v2/dataset/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
import cPickle
import glob

__all__ = ['DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader']
__all__ = [
'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader',
'convert'
]

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')

Expand Down
38 changes: 26 additions & 12 deletions python/paddle/v2/dataset/conll05.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
import tarfile
import gzip
import itertools
from common import download
import paddle.v2.dataset.common

# Each exported name must be its own string: the single string
# 'test, get_dict' names no attribute of this module, so neither function
# was exported and `from ... import *` would fail.
__all__ = ['test', 'get_dict', 'get_embedding', 'convert']

DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
Expand Down Expand Up @@ -182,17 +182,23 @@ def get_dict():
"""
Get the word, verb and label dictionary of Wikipedia corpus.
"""
word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
word_dict = load_dict(
paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
WORDDICT_MD5))
verb_dict = load_dict(
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
VERBDICT_MD5))
label_dict = load_dict(
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
TRGDICT_MD5))
return word_dict, verb_dict, label_dict


def get_embedding():
"""
Get the trained word vector based on Wikipedia corpus.
"""
return download(EMB_URL, 'conll05st', EMB_MD5)
return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)


def test():
Expand All @@ -209,15 +215,23 @@ def test():
"""
word_dict, verb_dict, label_dict = get_dict()
reader = corpus_reader(
download(DATA_URL, 'conll05st', DATA_MD5),
paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
return reader_creator(reader, word_dict, verb_dict, label_dict)


def fetch():
download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
download(EMB_URL, 'conll05st', EMB_MD5)
download(DATA_URL, 'conll05st', DATA_MD5)
paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)


def convert(path):
    """
    Converts dataset to recordio format.

    :param path: forwarded unchanged as the first argument of
        paddle.v2.dataset.common.convert.
    """
    # NOTE(review): both outputs are produced from test(); the "train"
    # output therefore holds the same data as the "test" output. Confirm
    # this is intended.
    # Output names use the dataset's spelling "conll05" (the previous
    # "conl105" contained a digit 1 in place of the second letter l).
    paddle.v2.dataset.common.convert(path, test(), 10, "conll05_train")
    paddle.v2.dataset.common.convert(path, test(), 10, "conll05_test")
11 changes: 10 additions & 1 deletion python/paddle/v2/dataset/imdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import string
import threading

__all__ = ['build_dict', 'train', 'test']
__all__ = ['build_dict', 'train', 'test', 'convert']

URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
Expand Down Expand Up @@ -166,3 +166,12 @@ def word_dict():

def fetch():
paddle.v2.dataset.common.download(URL, 'imdb', MD5)


def convert(path):
    """
    Convert the IMDB train and test data to recordio format.

    The word dictionary is built once and shared by both splits.

    :param path: forwarded unchanged as the first argument of
        paddle.v2.dataset.common.convert.
    """
    vocab = word_dict()

    # NOTE(review): common.convert receives a zero-argument callable that
    # returns whatever train(vocab) / test(vocab) returns -- confirm this is
    # the form common.convert expects for its reader argument.
    def train_reader():
        return train(vocab)

    def test_reader():
        return test(vocab)

    paddle.v2.dataset.common.convert(path, train_reader, 10, "imdb_train")
    paddle.v2.dataset.common.convert(path, test_reader, 10, "imdb_test")
14 changes: 13 additions & 1 deletion python/paddle/v2/dataset/imikolov.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import collections
import tarfile

__all__ = ['train', 'test', 'build_dict']
__all__ = ['train', 'test', 'build_dict', 'convert']

URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
MD5 = '30177ea32e27c525793142b6bf2c8e2d'
Expand Down Expand Up @@ -146,3 +146,15 @@ def test(word_idx, n, data_type=DataType.NGRAM):

def fetch():
paddle.v2.dataset.common.download(URL, "imikolov", MD5)


def convert(path):
    """
    Convert the imikolov train and test data to recordio format.

    The dictionary from build_dict() is built once and shared by both splits.

    :param path: forwarded unchanged as the first argument of
        paddle.v2.dataset.common.convert.
    """
    n = 5  # passed as the ``n`` argument of train() and test()
    vocab = build_dict()
    splits = (
        (train, "imikolov_train"),
        (test, "imikolov_test"),
    )
    # Each reader is created immediately before its own conversion.
    for make_reader, name in splits:
        paddle.v2.dataset.common.convert(path,
                                         make_reader(vocab, n), 10, name)
10 changes: 9 additions & 1 deletion python/paddle/v2/dataset/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import subprocess
import numpy
import platform
__all__ = ['train', 'test']
__all__ = ['train', 'test', 'convert']

URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
Expand Down Expand Up @@ -113,3 +113,11 @@ def fetch():
paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)


def convert(path):
    """
    Converts dataset to recordio format.

    :param path: forwarded unchanged as the first argument of
        paddle.v2.dataset.common.convert.
    """
    # Output names use the dataset's spelling "mnist" (previously misspelled
    # "minist").
    paddle.v2.dataset.common.convert(path, train(), 10, "mnist_train")
    paddle.v2.dataset.common.convert(path, test(), 10, "mnist_test")
17 changes: 13 additions & 4 deletions python/paddle/v2/dataset/movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@
"""

import zipfile
from common import download
import paddle.v2.dataset.common
import re
import random
import functools

__all__ = [
'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
'convert'
]

age_table = [1, 18, 25, 35, 45, 50, 56]
Expand Down Expand Up @@ -99,7 +100,7 @@ def __repr__(self):


def __initialize_meta_info__():
fn = download(URL, "movielens", MD5)
fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
global MOVIE_INFO
if MOVIE_INFO is None:
pattern = re.compile(r'^(.*)\((\d+)\)$')
Expand Down Expand Up @@ -246,7 +247,15 @@ def unittest():


def fetch():
download(URL, "movielens", MD5)
paddle.v2.dataset.common.download(URL, "movielens", MD5)


def convert(path):
    """
    Convert the MovieLens train and test readers to recordio format.

    :param path: forwarded unchanged as the first argument of
        paddle.v2.dataset.common.convert.
    """
    splits = (
        (train, "movielens_train"),
        (test, "movielens_test"),
    )
    # Each reader is created immediately before its own conversion.
    for make_reader, name in splits:
        paddle.v2.dataset.common.convert(path, make_reader(), 10, name)


if __name__ == '__main__':
Expand Down
22 changes: 16 additions & 6 deletions python/paddle/v2/dataset/sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@
import nltk
from nltk.corpus import movie_reviews

import common
import paddle.v2.dataset.common

__all__ = ['train', 'test', 'get_word_dict']
__all__ = ['train', 'test', 'get_word_dict', 'convert']
NUM_TRAINING_INSTANCES = 1600
NUM_TOTAL_INSTANCES = 2000

Expand All @@ -39,12 +39,13 @@ def download_data_if_not_yet():
"""
try:
# make sure that nltk can find the data
if common.DATA_HOME not in nltk.data.path:
nltk.data.path.append(common.DATA_HOME)
if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
movie_reviews.categories()
except LookupError:
print "Downloading movie_reviews data set, please wait....."
nltk.download('movie_reviews', download_dir=common.DATA_HOME)
nltk.download(
'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
print "Download data set success....."
print "Path is " + nltk.data.find('corpora/movie_reviews').path

Expand Down Expand Up @@ -128,4 +129,13 @@ def test():


def fetch():
nltk.download('movie_reviews', download_dir=common.DATA_HOME)
nltk.download(
'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)


def convert(path):
    """
    Convert the sentiment train and test data to recordio format.

    :param path: forwarded unchanged as the first argument of
        paddle.v2.dataset.common.convert.
    """
    splits = (
        (train, "sentiment_train"),
        (test, "sentiment_test"),
    )
    # The train/test functions themselves are handed over, not their results.
    for reader_fn, name in splits:
        paddle.v2.dataset.common.convert(path, reader_fn, 10, name)
20 changes: 14 additions & 6 deletions python/paddle/v2/dataset/uci_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@
"""
UCI Housing dataset.

This module will download dataset from
This module will download dataset from
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
parse training set and test set into paddle reader creators.
"""

import numpy as np
import os
from common import download
import paddle.v2.dataset.common

# Public API of this module; convert() is exported alongside the readers.
__all__ = ['train', 'test', 'convert']

URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
MD5 = 'd4accdce7a25600298819f8e28e8d593'
# Names of the 13 feature columns. Only column names belong in this list;
# exported function names such as 'convert' go in __all__ above.
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT'
]

UCI_TRAIN_DATA = None
Expand Down Expand Up @@ -82,7 +82,7 @@ def train():
:rtype: callable
"""
global UCI_TRAIN_DATA
load_data(download(URL, 'uci_housing', MD5))
load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))

def reader():
for d in UCI_TRAIN_DATA:
Expand All @@ -102,7 +102,7 @@ def test():
:rtype: callable
"""
global UCI_TEST_DATA
load_data(download(URL, 'uci_housing', MD5))
load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))

def reader():
for d in UCI_TEST_DATA:
Expand All @@ -112,4 +112,12 @@ def reader():


def fetch():
download(URL, 'uci_housing', MD5)
paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)


def convert(path):
    """
    Converts dataset to recordio format.

    :param path: forwarded unchanged as the first argument of
        paddle.v2.dataset.common.convert.
    """
    # Both output names share the "uci_housing_" prefix (the test name was
    # previously misspelled "uci_houseing_test").
    paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train")
    paddle.v2.dataset.common.convert(path, test(), 10, "uci_housing_test")
Loading