Skip to content
This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Commit

Permalink
Feature/dataframes (#113)
Browse files Browse the repository at this point in the history
Updated code and comments for API programming demos to use dataframe-like structures.
  • Loading branch information
mjaquiery committed Jul 6, 2023
1 parent e4a5569 commit 10bbce8
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 34 deletions.
28 changes: 22 additions & 6 deletions frontend/src/GetDatasetJulia.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ export default function GetDatasetJulia({dataset}) {
# Copyright (c) 2020-2023, The Chancellor, Masters and Scholars of the University
# of Oxford, and the 'Galv' Developers. All rights reserved.
# By Matt Jaquiery <matt.jaquiery@dtc.ox.ac.uk>
# Download datasets from the REST API.
# Downloads all data for all columns for the dataset and reads them
# into a Dict object. Data are under datasets[x] as DataFrames.
#
# Dataset and column metadata are under dataset_metadata[x] and
# column_metadata[x] respectively.
using HTTP
using JSON
Expand All @@ -45,7 +54,9 @@ headers = Dict{String, String}("Authorization" => "Bearer ${token}")
verbose = true
dataset_ids = [${dataset.id}]
api_data = Dict{Int, Dict{String, Any}}()
dataset_metadata = Dict{Int64, Dict{String, Any}}()
column_metadata = Dict{Int64, Dict{Int64, Any}}()
datasets = Dict{Int64, DataFrame}()
function vprintln(s)
if verbose
Expand All @@ -68,7 +79,7 @@ function get_column_values(dataset_id, column)
try
body = String(response.body)
str_values = split(body, '\n')
str_values = split(body, '\\n')
values = Vector{String}(str_values[begin:end-1])
if dtype == "float"
return map((x -> parse(Float64, x)), values)
Expand Down Expand Up @@ -100,7 +111,8 @@ function get_column(dataset_id, url)
# Download column values
values = get_column_values(dataset_id, column)
pop!(column, "values", "")
column["values"] = values
datasets[dataset_id][!, column["name"]] = values
return column
end
Expand All @@ -117,16 +129,19 @@ function get_dataset(id)
println("Error parsing JSON for dataset $id")
return
end
api_data[id] = body
dataset_metadata[id] = body
# Download columns
columns = api_data[id]["columns"]
columns = dataset_metadata[id]["columns"]
len = length(columns)
vprintln("Downloading $len columns for dataset $id")
datasets[id] = DataFrame()
column_metadata[id] = Dict{Int64, Any}()
for (i, col) in enumerate(columns)
timings = @timed column = get_column(id, col)
api_data[id]["columns"][i] = column
column_metadata[id][i] = column
n = column["name"]
s = round(timings.time, digits = 2)
vprintln("Column $n completed in $s seconds")
Expand All @@ -142,6 +157,7 @@ for id in dataset_ids
end
vprintln("All datasets complete.")
`
}</SyntaxHighlighter>
)
Expand Down
40 changes: 20 additions & 20 deletions frontend/src/GetDatasetMatlab.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@ export default function GetDatasetMatlab({dataset}) {
%
% Download datasets from the REST API.
% Downloads all data for all columns for the dataset and reads them
% into a struct object. Data are under data{x}.columns.data.
% into a cell array. Data are under datasets{x} as Tables.
% Column names are coerced to valid MATLAB variable names using
% matlab.lang.makeValidName.
%
% Dataset and column metadata are under dataset_metadata{x} and
% column_metadata{x} respectively.
%
% SPDX-License-Identifier: BSD-2-Clause
% Copyright (c) 2020-2023, The Chancellor, Masters and Scholars of the University
Expand All @@ -33,49 +38,44 @@ token = '${token}';
apiURL = '${host}datasets';
options = weboptions('HeaderFields', {'Authorization' ['Bearer ' token]});
% Datasets can be referenced by name or by id.
% Only the id is guaranteed to be unique.
% Datasets are referenced by id.
% You can add in additional dataset_names or dataset_ids to also
% fetch the contents of those datasets.
dataset_names = [];
dataset_ids = [${dataset.id}]; % add additional dataset ids here if required
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
n = max(dataset_ids);
dataset_metadata = cell(n, 1);
column_metadata = cell(n, 1);
datasets = cell(n, 1);
% look up dataset ids if names provided
if exist('dataset_names', 'var') && ~ isempty(dataset_names)
dataset_meta = webread(apiURL, options);
dataset_ids = [dataset_ids dataset_meta(ismember({dataset_meta.name}, dataset_names)).id];
end
dataset_ids = string(dataset_ids(:));
dataset_ids = unique(dataset_ids);
data(1, length(dataset_ids)) = struct();
for i = 1:length(dataset_ids)
d = dataset_ids(i);
% get data
dsURL = strcat(apiURL, '/', d, '/');
dsURL = strcat(apiURL, '/', num2str(d), '/');
meta = webread(dsURL, options);
col_data = {};
dataset_metadata{d} = meta;
column_metadata{i} = cell(length(meta.columns), 1);
datasets{i} = table();
% append column data in columns
for c = 1:length(meta.columns)
cURL = meta.columns{c};
stream = webread(cURL, options);
meta.column_details{c} = stream;
column_metadata{i}{c} = stream;
column_content = webread(stream.values, options);
% drop final newline
column_content = regexprep(column_content, '\n$', '');
column_content = strsplit(column_content, '\n');
column_content = arrayfun(@(c) str2num(c{1}), column_content);
col_data{c} = column_content;
datasets{i}.(matlab.lang.makeValidName(stream.name)) = rot90(column_content, -1);
end
data(i).columns = col_data;
end`
end
`

return (
<React.Fragment>
Expand Down
34 changes: 26 additions & 8 deletions frontend/src/GetDatasetPython.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,16 @@ export default function GetDatasetPython({dataset}) {
# Copyright (c) 2020-2023, The Chancellor, Masters and Scholars of the University
# of Oxford, and the 'Galv' Developers. All rights reserved.
import urllib3
# By Matt Jaquiery <matt.jaquiery@dtc.ox.ac.uk>
# Download datasets from the REST API.
# Downloads all data for all columns for the dataset and reads them
# into a dict. Data are under datasets[x] as pandas DataFrames.
#
# Dataset and column metadata are under dataset_metadata[x] and
# column_metadata[x] respectively.
import urllib3 # install via pip if not available
host = "${host}"
headers = {'Authorization': 'Bearer ${token}'}
Expand All @@ -47,17 +56,18 @@ if verbose:
# Add additional dataset ids to download additional datasets
dataset_ids = [${dataset.id}]
api_data = {}
dataset_metadata = {} # Will have keys=dataset_id, values=Dict of dataset metadata
column_metadata = {} # Will have keys=dataset_id, values=Dict of column metadata
datasets = {} # Will have keys=dataset_id, values=pandas DataFrame of data
# Download data
start_time = time.time()
if verbose:
start_time = time.time()
print(f"Downloading {len(dataset_ids)} datasets from {host}")
for dataset_id in dataset_ids:
dataset_start_time = time.time()
if verbose:
dataset_start_time = time.time()
print(f"Downloading dataset {dataset_id}")
r = urllib3.request('GET', f"{host}/datasets/{dataset_id}/", headers=headers)
try:
Expand All @@ -70,10 +80,13 @@ for dataset_id in dataset_ids:
continue
columns = json.get('columns', [])
json['columns'] = []
api_data[dataset_id] = json
dataset_metadata[dataset_id] = json
if verbose:
print(f"Dataset {dataset_id} has {len(columns)} columns to download")
# Download the data from all columns in the dataset
datasets[dataset_id] = pandas.DataFrame()
for i, column in enumerate(columns):
if verbose:
print(f"Downloading dataset {dataset_id} column {i}")
Expand All @@ -92,15 +105,20 @@ for dataset_id in dataset_ids:
if v.status != 200:
print(f"Error downloading values for dataset {dataset_id} column {json.get('name')}: {v.status}")
continue
json['values'] = v.data.decode('utf-8').split('\n')
try:
datasets[dataset_id][json.get('name')] = v.data.decode('utf-8').split('\\n')
except:
print(f"Cannot translate JSON response into DataFrame for column values {json.get['values']}")
continue
api_data[dataset_id]['columns'].append(json)
column_metadata[dataset_id] = json
if verbose:
print(f"Finished downloading dataset {dataset_id} in {time.time() - dataset_start_time} seconds")
if verbose:
print(f"Finished downloading {len(dataset_ids)} datasets in {time.time() - start_time} seconds")
`
}</SyntaxHighlighter>
)
Expand Down

0 comments on commit 10bbce8

Please sign in to comment.