Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for bill of the House of Representatives of Japan #184

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ You can use datasets easily because you can access each dataset with multiple wa
* Geolonia Japanese Addresses
* Hepatitis
* House of Councillors of Japan
* House of Representatives of Japan
* Iris Dataset
* Libsvm
* MNIST database
Expand Down
16 changes: 16 additions & 0 deletions example/house-of-representative.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env ruby

require "datasets"

# Print the supporter count, promulgation date and title of every bill that
# was submitted with at least one hundred supporters and has been promulgated.
Datasets::HouseOfRepresentative.new.each do |bill|
  supporter_count = bill.supporters_of_submitted_bill.size
  next if supporter_count < 100

  promulgated_on = bill.promulgated_on
  next if promulgated_on.nil?

  p [supporter_count, promulgated_on, bill.title]
end
114 changes: 114 additions & 0 deletions lib/datasets/house-of-representative.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
require "csv"

require_relative "dataset"
require_relative "japanese-date-parser"

module Datasets
  # Bills of the House of Representatives of Japan.
  #
  # The data is a CSV file downloaded into the cache directory on first use.
  # Every CSV row is yielded as a Record after three transformations:
  #
  # 1. columns whose header contains "/" are split into two columns,
  # 2. every field is passed through JapaneseDateParser#parse,
  # 3. the multi-valued columns listed in ARRAY_COLUMN_NAMES are split on "; "
  #    into arrays of strings.
  class HouseOfRepresentative < Dataset
    # One bill. Members are positional: they must stay in the same order as
    # the columns produced by #split_csv_column.
    Record = Struct.new(:carry_time,
                        :caption,
                        :type,
                        :submit_time,
                        :submit_number,
                        :title,
                        :discussion_status,
                        :progress,
                        :progress_url,
                        :text,
                        :text_url,
                        :bill_type,
                        :submitter,
                        :submitter_in_house_groups,
                        :house_of_representatives_of_accepted_bill_on_preliminary_consideration,
                        :house_of_representatives_of_preliminary_refer_on,
                        :house_of_representatives_of_preliminary_refer_commission,
                        :house_of_representatives_of_accepted_bill_on,
                        :house_of_representatives_of_refer_on,
                        :house_of_representatives_of_refer_commission,
                        :house_of_representatives_of_finished_consideration_on,
                        :house_of_representatives_of_consideration_result,
                        :house_of_representatives_of_finished_deliberation_on,
                        :house_of_representatives_of_deliberation_result,
                        :house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
                        :house_of_representatives_of_support_in_house_group_during_deliberation,
                        :house_of_representatives_of_opposition_in_house_group_during_deliberation,
                        :house_of_councillors_of_accepted_bill_on_preliminary_consideration,
                        :house_of_councillors_of_preliminary_refer_on,
                        :house_of_councillors_of_preliminary_refer_commission,
                        :house_of_councillors_of_accepted_bill_on,
                        :house_of_councillors_of_refer_on,
                        :house_of_councillors_of_refer_commission,
                        :house_of_councillors_of_finished_consideration_on,
                        :house_of_councillors_of_consideration_result,
                        :house_of_councillors_of_finished_deliberation_on,
                        :house_of_councillors_of_deliberation_result,
                        :promulgated_on,
                        :law_number,
                        :submitters,
                        :supporters_of_submitted_bill)

    # Separator used inside combined headers and inside their field values.
    SPLIT_COLUMN_CHAR_ON_HEADER = "/".freeze
    SPLIT_COLUMN_CHAR_ON_FIELD = SPLIT_COLUMN_CHAR_ON_HEADER

    # CSV headers of the columns whose value is a "; "-separated list.
    ARRAY_COLUMN_NAMES = %w(
      議案提出会派
      衆議院審議時賛成会派
      衆議院審議時反対会派
      議案提出者一覧
      議案提出の賛成者
    ).freeze

    def initialize
      super()

      @metadata.id = "house-of-representative"
      @metadata.name = "Bill of the House of Representatives of Japan"
      @metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
      @metadata.licenses = ["MIT"]
      @metadata.description = "Bill of the House of Representatives of Japan"
    end

    # Iterates over all bills.
    #
    # @yield [record] each bill, in CSV order
    # @yieldparam record [Record]
    # @return [Enumerator] when no block is given
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          row = split_csv_column(row)
          # row.fields returns a copy, so assigning into row while walking
          # the copy is safe.
          row.fields.each_with_index do |field, idx|
            row[idx] = JapaneseDateParser.new(field).parse
          end
          ARRAY_COLUMN_NAMES.each do |array_column_name|
            row[array_column_name] = parse_array(row[array_column_name])
          end
          yield(Record.new(*row.fields))
        end
      end
    end

    private

    # Downloads the CSV (via Dataset#download, which is defined outside this
    # file) and yields an opened CSV reader with header handling and integer
    # conversion enabled.
    def open_data
      data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
      data_path = cache_dir_path + "bill.csv"
      download(data_path, data_url)

      CSV.open(data_path, col_sep: ",", headers: true, converters: %i(integer)) do |csv|
        yield(csv)
      end
    end

    # Splits a "; "-separated value into an array of strings.
    # nil (an empty CSV field) becomes an empty array.
    def parse_array(column_value)
      column_value.to_s.split("; ")
    end

    # Returns a new CSV::Row in which every column whose header contains
    # SPLIT_COLUMN_CHAR_ON_HEADER is replaced by two columns: the first two
    # parts of the header, paired with the first two parts of the field.
    # Missing parts become nil. The given row is not modified.
    def split_csv_column(row)
      new_headers = []
      new_fields = []
      row.headers.zip(row.fields).each do |header, field|
        unless header.to_s.include?(SPLIT_COLUMN_CHAR_ON_HEADER)
          new_headers << header
          new_fields << field
          next
        end

        sub_headers = header.split(SPLIT_COLUMN_CHAR_ON_HEADER)
        # to_s: an empty CSV field is nil and a numeric-looking one may have
        # been converted to Integer; neither responds to #split.
        sub_fields = field.to_s.split(SPLIT_COLUMN_CHAR_ON_FIELD)
        new_headers.push(sub_headers[0], sub_headers[1])
        new_fields.push(sub_fields[0], sub_fields[1])
      end
      CSV::Row.new(new_headers, new_fields)
    end
  end
end
42 changes: 42 additions & 0 deletions lib/datasets/japanese-date-parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
require "date"

module Datasets
  # Converts a Japanese-era date string such as "平成10年 1月 1日" or
  # "令和元年 5月 1日" into a Date.
  #
  # Only the eras listed in ERA_INITIALS are supported.
  class JapaneseDateParser
    # Raised when the string looks like a Japanese-era date but its two-letter
    # era name is not a key of ERA_INITIALS.
    class UnsupportedEraInitialRange < StandardError; end

    # Era name => era initial as understood by Date.jisx0301.
    ERA_INITIALS = {
      "平成" => "H",
      "令和" => "R",
    }.freeze

    # era, year ("元" means the first year), month, day; optional whitespace
    # may precede each number. The ".." alternative catches any other
    # two-character era so that it can be reported instead of silently passed
    # through.
    DATE_PATTERN = /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/

    # @param string [Object] the value to parse; usually a String or nil
    def initialize(string)
      @string = string
    end

    # @return [Date] when the value is a supported Japanese-era date string
    # @return [nil] when the value is nil
    # @return [Object] the value itself when it does not look like a
    #   Japanese-era date
    # @raise [UnsupportedEraInitialRange] when the era is not supported
    def parse
      case @string
      when nil
        nil
      when DATE_PATTERN
        era, year, month, day = Regexp.last_match.captures
        era_initial = ERA_INITIALS[era]
        if era_initial.nil?
          # Built by interpolation rather than <<, so it also works when
          # string literals are frozen.
          supported_eras = ERA_INITIALS.keys.join(", ")
          raise UnsupportedEraInitialRange,
                "era must be one of [#{supported_eras}]: #{era}"
        end

        year = "1" if year == "元"
        jisx0301 = format("%s%s.%s.%s",
                          era_initial,
                          year.rjust(2, "0"),
                          month.rjust(2, "0"),
                          day.rjust(2, "0"))
        Date.jisx0301(jisx0301)
      else
        @string
      end
    end
  end
end
1 change: 1 addition & 0 deletions lib/datasets/lazy.rb
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def const_missing(name)
LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
LAZY_LOADER.register(:Iris, "datasets/iris")
LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
Expand Down
23 changes: 23 additions & 0 deletions test/japanese-date-parser-test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
class JapaneseDateParserTest < Test::Unit::TestCase
  # label => [expected JIS X 0301 representation, Japanese date string]
  {
    "month and day with leading a space in Heisei" => ["H10.01.01", "平成10年 1月 1日"],
    "month with leading a space in Heisei" => ["H10.01.10", "平成10年 1月10日"],
    " day with leading a space in Heisei" => ["H10.10.01", "平成10年10月 1日"],
    " without leading a space in Heisei" => ["H10.10.10", "平成10年10月10日"],
    "year, month and day with leading a space in Reiwa" => ["R02.01.01", "令和 2年 1月 1日"],
    "year, month with leading a space in Reiwa" => ["R02.01.10", "令和 2年 1月10日"],
    "year, day with leading a space in Reiwa" => ["R02.10.01", "令和 2年10月 1日"],
    "year, without leading a space in Reiwa" => ["R02.10.10", "令和 2年10月10日"],
    "boundary within Heisei" => ["H31.04.30", "平成31年 4月30日"],
    "boundary within Reiwa" => ["R01.05.01", "令和元年 5月 1日"],
  }.each do |label, pattern|
    data(label, pattern)
  end

  # Parses each registered date string and compares its JIS X 0301 form.
  test("#parse") do
    expected_jisx0301, japanese_date_string = data
    parser = Datasets::JapaneseDateParser.new(japanese_date_string)
    assert_equal(expected_jisx0301, parser.parse.jisx0301)
  end

  # An era outside the supported set must be reported, naming the era.
  test("unsupported era initial range") do
    error_class = Datasets::JapaneseDateParser::UnsupportedEraInitialRange
    expected_error = error_class.new("era must be one of [平成, 令和]: 昭和")
    assert_raise(expected_error) do
      Datasets::JapaneseDateParser.new("昭和元年 1月 1日").parse
    end
  end
end
103 changes: 103 additions & 0 deletions test/test-house-of-representative.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
class HouseOfRepresentativeTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::HouseOfRepresentative.new
  end

  # Builds an expected record from positional values, in Record member order.
  def record(*args)
    Datasets::HouseOfRepresentative::Record.new(*args)
  end

  # Checks the number of records and the full contents of the first and the
  # last record.
  test("#each") do
    records = @dataset.each.to_a

    expected_first = record(
      # Bill identification and submitter
      142,
      "衆法の一覧",
      nil,
      139,
      18,
      "市民活動促進法案",
      "成立",
      "経過",
      "https://www.shugiin.go.jp/internet/itdb_gian.nsf/html/gian/keika/5516.htm",
      nil,
      nil,
      "衆法",
      "熊代 昭彦君外四名",
      %w(自由民主党 社会民主党・市民連合 新党さきがけ),
      # House of Representatives
      nil,
      nil,
      nil,
      Date.jisx0301("H10.03.04"),
      Date.jisx0301("H10.03.11"),
      "内閣",
      Date.jisx0301("H10.03.17"),
      "可決",
      Date.jisx0301("H10.03.19"),
      "可決",
      nil,
      [],
      [],
      # House of Councillors
      nil,
      nil,
      nil,
      nil,
      Date.jisx0301("H10.01.12"),
      "労働・社会政策",
      Date.jisx0301("H10.03.03"),
      "修正",
      Date.jisx0301("H10.03.04"),
      "修正",
      # Promulgation, law number, submitters and supporters
      Date.jisx0301("H10.03.25"),
      "7",
      [],
      []
    )

    expected_last = record(
      # Bill identification and submitter
      212,
      "規則の一覧",
      nil,
      212,
      1,
      "衆議院規則の一部を改正する規則案",
      "衆議院で閉会中審査",
      "経過",
      "https://www.shugiin.go.jp/internet/itdb_gian.nsf/html/gian/keika/1DDAB2A.htm",
      nil,
      nil,
      "規則",
      "遠藤 敬君外五名",
      %w(日本維新の会 国民民主党・無所属クラブ 有志の会),
      # House of Representatives
      nil,
      nil,
      nil,
      Date.jisx0301("R05.12.11"),
      Date.jisx0301("R05.12.12"),
      "議院運営",
      nil,
      nil,
      "",
      "閉会中審査",
      nil,
      [],
      [],
      # House of Councillors
      nil,
      nil,
      nil,
      nil,
      nil,
      nil,
      nil,
      nil,
      nil,
      nil,
      # Promulgation, law number, submitters and supporters
      nil,
      nil,
      %w(遠藤敬君 中司宏君 金村龍那君 古川元久君 浅野哲君 福島伸享君),
      %w(足立康史君 阿部司君 阿部弘樹君 青柳仁士君 赤木正幸君 浅川義治君 井上英孝君 伊東信久君 池下卓君 池畑浩太朗君 一谷勇一郎君 市村浩一郎君 岩谷良平君 浦野靖人君 漆間譲司君 遠藤良太君 小野泰輔君 奥下剛光君 沢田良君 杉本和巳君 住吉寛紀君 空本誠喜君 高橋英明君 中嶋秀樹君 馬場伸幸君 早坂敦君 林佑美君 藤田文武君 藤巻健太君 堀場幸子君 掘井健智君 三木圭恵君 美延映夫君 岬麻紀君 守島正君 山本剛正君 吉田とも代君 和田有一朗君 鈴木義弘君 田中健君 玉木雄一郎君 長友慎治君 西岡秀子君 吉良州司君 北神圭朗君 緒方林太郎君)
    )

    expected = [10521, expected_first, expected_last]
    actual = [records.size, records.first, records.last]
    assert_equal(expected, actual)
  end
end