Skip to content

Commit

Permalink
Fix no table found warning and add tests for two tables
Browse files Browse the repository at this point in the history
  • Loading branch information
vinayak-mehta committed Nov 23, 2018
1 parent bf89411 commit 1f71513
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ install:
pip install ".[dev]"

test:
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl

docs:
cd docs && make html
Expand Down
15 changes: 13 additions & 2 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,21 @@ def _generate_columns_and_rows(self, table_idx, tk):
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
# calculate mode of the list of number of elements in
# each row to guess the number of columns
ncols = max(set(elements), key=elements.count)
if ncols == 1:
warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname)))
# if the mode is 1, the page usually contains no tables,
# but the list can be skewed by many single-element rows;
# in that case, remove all 1s from the list and, if any
# elements remain, use the mode of the remaining elements
# as the number of columns
elements = list(filter(lambda x: x != 1, elements))
if len(elements):
ncols = max(set(elements), key=elements.count)
else:
warnings.warn("No tables found in table area {}".format(
table_idx + 1))
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
inner_text = []
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
test=pytest

[tool:pytest]
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
python_files = tests/test_*.py
125 changes: 125 additions & 0 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,99 @@
["NFHS-1 (1992-93)", "57.7", "37.6", "26.5", "4.3", "3.6", "1.3", "0.1", "1.9", "na", "na", "11.3", "8.3", "na", "42.3", "100.0", "3,970"]
]

# Expected contents of the first table in the stream "two tables" test:
# test_stream_two_tables (tests/test_common.py) wraps this list in a
# pandas DataFrame and compares it with tables[0].df parsed from
# tabula/12s0324.pdf. Each inner list is one table row; "" marks a cell
# with no text. The strings must stay exactly as written, since the
# comparison is an exact DataFrame.equals check.
data_stream_two_tables_1 = [
["[In thousands (11,062.6 represents 11,062,600) For year ending December 31. Based on Uniform Crime Reporting (UCR)", "", "", "", "", "", "", "", "", ""],
["Program. Represents arrests reported (not charged) by 12,910 agencies with a total population of 247,526,916 as estimated", "", "", "", "", "", "", "", "", ""],
["by the FBI. Some persons may be arrested more than once during a year, therefore, the data in this table, in some cases,", "", "", "", "", "", "", "", "", ""],
["could represent multiple arrests of the same person. See text, this section and source]", "", "", "", "", "", "", "", "", ""],
["", "", "Total", "", "", "Male", "", "", "Female", ""],
["Offense charged", "", "Under 18", "18 years", "", "Under 18", "18 years", "", "Under 18", "18 years"],
["", "Total", "years", "and over", "Total", "years", "and over", "Total", "years", "and over"],
["Total . . . . . . . . . . . . . . . . . . . . . . . . .", "11,062 .6", "1,540 .0", "9,522 .6", "8,263 .3", "1,071 .6", "7,191 .7", "2,799 .2", "468 .3", "2,330 .9"],
["Violent crime . . . . . . . . . . . . . . . . . .", "467 .9", "69 .1", "398 .8", "380 .2", "56 .5", "323 .7", "87 .7", "12 .6", "75 .2"],
["Murder and nonnegligent", "", "", "", "", "", "", "", "", ""],
["manslaughter . . . . . . . .. .. .. .. ..", "10.0", "0.9", "9.1", "9.0", "0.9", "8.1", "1.1", "–", "1.0"],
["Forcible rape . . . . . . . .. .. .. .. .. .", "17.5", "2.6", "14.9", "17.2", "2.5", "14.7", "–", "–", "–"],
["Robbery . . . .. .. . .. . ... . ... . ...", "102.1", "25.5", "76.6", "90.0", "22.9", "67.1", "12.1", "2.5", "9.5"],
["Aggravated assault . . . . . . . .. .. ..", "338.4", "40.1", "298.3", "264.0", "30.2", "233.8", "74.4", "9.9", "64.5"],
["Property crime . . . . . . . . . . . . . . . . .", "1,396 .4", "338 .7", "1,057 .7", "875 .9", "210 .8", "665 .1", "608 .2", "127 .9", "392 .6"],
["Burglary . .. . . . . .. ... .... .... ..", "240.9", "60.3", "180.6", "205.0", "53.4", "151.7", "35.9", "6.9", "29.0"],
["Larceny-theft . . . . . . . .. .. .. .. .. .", "1,080.1", "258.1", "822.0", "608.8", "140.5", "468.3", "471.3", "117.6", "353.6"],
["Motor vehicle theft . . . . .. .. . .... .", "65.6", "16.0", "49.6", "53.9", "13.3", "40.7", "11.7", "2.7", "8.9"],
["Arson .. . . . .. . ... .... .... .... .", "9.8", "4.3", "5.5", "8.1", "3.7", "4.4", "1.7", "0.6", "1.1"],
["Other assaults .. . . . . .. . ... . ... ..", "1,061.3", "175.3", "886.1", "785.4", "115.4", "670.0", "276.0", "59.9", "216.1"],
["Forgery and counterfeiting .. . . . . . ..", "68.9", "1.7", "67.2", "42.9", "1.2", "41.7", "26.0", "0.5", "25.5"],
["Fraud .... .. . . .. ... .... .... ....", "173.7", "5.1", "168.5", "98.4", "3.3", "95.0", "75.3", "1.8", "73.5"],
["Embezzlement . . .. . . . .. . ... . ....", "14.6", "–", "14.1", "7.2", "–", "6.9", "7.4", "–", "7.2"],
["Stolen property 1 . . . . . . .. . .. .. ...", "84.3", "15.1", "69.2", "66.7", "12.2", "54.5", "17.6", "2.8", "14.7"],
["Vandalism . . . . . . . .. .. .. .. .. ....", "217.4", "72.7", "144.7", "178.1", "62.8", "115.3", "39.3", "9.9", "29.4"],
["Weapons; carrying, possessing, etc. .", "132.9", "27.1", "105.8", "122.1", "24.3", "97.8", "10.8", "2.8", "8.0"],
["Prostitution and commercialized vice",
"56.9", "1.1", "55.8", "17.3", "–", "17.1", "39.6", "0.8", "38.7"],
["Sex offenses 2 . . . . .. . . . .. .. .. . ..", "61.5", "10.7", "50.7", "56.1", "9.6", "46.5", "5.4", "1.1", "4.3"],
["Drug abuse violations . . . . . . . .. ...", "1,333.0", "136.6", "1,196.4", "1,084.3", "115.2", "969.1", "248.7", "21.4", "227.3"],
["Gambling .. . . . . .. ... . ... . ... ...", "8.2", "1.4", "6.8", "7.2", "1.4", "5.9", "0.9", "–", "0.9"],
["Offenses against the family and", "", "", "", "", "", "", "", "", ""],
["children . . . .. . . .. .. .. .. .. .. . ..", "92.4", "3.7", "88.7", "68.9", "2.4", "66.6", "23.4", "1.3", "22.1"],
["Driving under the influence . . . . . .. .", "1,158.5", "109.2", "1,147.5", "895.8", "8.2", "887.6", "262.7", "2.7", "260.0"],
["Liquor laws . . . . . . . .. .. .. .. .. .. .", "48.2", "90.2", "368.0", "326.8", "55.4", "271.4",
"131.4", "34.7", "96.6"],
["Drunkenness . . .. . . . .. . ... . ... ..", "488.1", "11.4", "476.8", "406.8", "8.5", "398.3", "81.3", "2.9", "78.4"],
["Disorderly conduct . .. . . . . . .. .. .. .", "529.5", "136.1", "393.3", "387.1", "90.8", "296.2", "142.4", "45.3", "97.1"],
["Vagrancy . . . .. . . . ... .... .... ...", "26.6", "2.2", "24.4", "20.9", "1.6", "19.3", "5.7", "0.6", "5.1"],
["All other offenses (except traffic) . . ..", "306.1", "263.4", "2,800.8", "2,337.1", "194.2", "2,142.9", "727.0", "69.2", "657.9"],
["Suspicion . . . .. . . .. .. .. .. .. .. . ..", "1.6", "–", "1.4", "1.2", "–", "1.0", "–", "–", "–"],
["Curfew and loitering law violations ..", "91.0", "91.0", "(X)", "63.1", "63.1", "(X)", "28.0", "28.0", "(X)"],
["Runaways . . . . . . . .. .. .. .. .. ....", "75.8", "75.8", "(X)", "34.0", "34.0", "(X)", "41.8", "41.8", "(X)"],
["", "– Represents zero. X Not applicable. 1 Buying, receiving, possessing stolen property. 2 Except forcible rape and prostitution.", "", "", "", "", "", "", "", ""],
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", "", "", "", "", ""]
]

# Expected contents of the second table in the stream "two tables" test:
# test_stream_two_tables (tests/test_common.py) wraps this list in a
# pandas DataFrame and compares it with tables[1].df parsed from
# tabula/12s0324.pdf. Each inner list is one table row; "" marks a cell
# with no text. The strings must stay exactly as written, since the
# comparison is an exact DataFrame.equals check.
data_stream_two_tables_2 = [
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, Uniform Crime Reports, Arrests Master Files.", "", "", "", ""],
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
["[Based on Uniform Crime Reporting (UCR) Program. Represents arrests reported (not charged) by 12,371 agencies", "", "", "", "", ""],
["with a total population of 239,839,971 as estimated by the FBI. See headnote, Table 324]", "", "", "", "", ""],
["", "", "", "", "American", ""],
["Offense charged", "", "", "",
"Indian/Alaskan", "Asian Pacific"],
["", "Total", "White", "Black", "Native", "Islander"],
["Total . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "10,690,561", "7,389,208", "3,027,153", "150,544", "123,656"],
["Violent crime . . . . . . . . . . . . . . . . . . . . . . . . . . . .", "456,965", "268,346", "177,766", "5,608", "5,245"],
["Murder and nonnegligent manslaughter . .. ... .", "9,739", "4,741", "4,801", "100", "97"],
["Forcible rape . . . . . . . .. .. .. .. .... .. ...... .", "16,362", "10,644", "5,319", "169", "230"],
["Robbery . . . . .. . . . ... . ... . .... .... .... . . .", "100,496", "43,039", "55,742", "726", "989"],
["Aggravated assault . . . . . . . .. .. ...... .. ....", "330,368", "209,922", "111,904", "4,613", "3,929"],
["Property crime . . . . . . . . . . . . . . . . . . . . . . . . . . .", "1,364,409", "922,139", "406,382", "17,599", "18,289"],
["Burglary . . .. . . . .. . .... .... .... .... ... . . .", "234,551", "155,994", "74,419", "2,021", "2,117"],
["Larceny-theft . . . . . . . .. .. .. .. .... .. ...... .", "1,056,473", "719,983", "306,625", "14,646", "15,219"],
["Motor vehicle theft . . . . . .. ... . ... ..... ... ..", "63,919", "39,077", "23,184", "817", "841"],
["Arson .. . . .. .. .. ... .... .... .... .... . . . . .", "9,466", "7,085", "2,154", "115", "112"],
["Other assaults .. . . . . . ... . ... . ... ..... ... ..", "1,032,502", "672,865", "332,435", "15,127", "12,075"],
["Forgery and counterfeiting .. . . . . . ... ..... .. ..", "67,054", "44,730", "21,251", "345", "728"],
["Fraud ... . . . . .. .. .. .. .. .. .. .. .. .... . . . . . .", "161,233", "108,032", "50,367", "1,315", "1,519"],
["Embezzlement . . . .. . . . ... . ... . .... ... .....", "13,960", "9,208", "4,429", "75", "248"],
["Stolen property; buying, receiving, possessing .. .", "82,714", "51,953", "29,357", "662", "742"],
["Vandalism . . . . . . . .. .. .. .. .. .. .... .. ..... .", "212,173", "157,723", "48,746", "3,352", "2,352"],
["Weapons—carrying, possessing, etc. .. .. ... .. .", "130,503", "74,942", "53,441", "951", "1,169"],
["Prostitution and commercialized vice . ... .. .. ..", "56,560", "31,699", "23,021", "427", "1,413"],
["Sex offenses 1 . . . . . . . .. .. .. .. .... .. ...... .", "60,175", "44,240", "14,347", "715", "873"],
["Drug abuse violations . . . . . . . .. . ..... .. .....", "1,301,629", "845,974", "437,623", "8,588", "9,444"],
["Gambling . . . . .. . . . ... . ... . .. ... . ...... .. .", "8,046", "2,290", "5,518", "27", "211"],
["Offenses against the family and children ... .. .. .", "87,232", "58,068", "26,850", "1,690", "624"],
["Driving under the influence . . . . . . .. ... ...... .", "1,105,401", "954,444", "121,594", "14,903", "14,460"],
["Liquor laws . . . . . . . .. .. .. .. .. . ..... .. .....", "444,087", "373,189", "50,431", "14,876", "5,591"],
["Drunkenness . .. . . . . . ... . ... . ..... . .......", "469,958", "387,542", "71,020", "8,552", "2,844"],
["Disorderly conduct . . .. . . . . .. .. . ..... .. .....", "515,689", "326,563", "176,169", "8,783", "4,174"],
["Vagrancy . . .. .. . . .. ... .... .... .... .... . . .", "26,347", "14,581", "11,031", "543", "192"],
["All other offenses (except traffic) . .. .. .. ..... ..", "2,929,217", "1,937,221", "911,670", "43,880", "36,446"],
["Suspicion . . .. . . . .. .. .. .. .. .. .. ...... .. . . .", "1,513", "677", "828", "1", "7"],
["Curfew and loitering law violations . .. ... .. ....", "89,578", "54,439", "33,207", "872", "1,060"],
["Runaways . . . . . . . .. .. .. .. .. .. .... .. ..... .", "73,616", "48,343", "19,670", "1,653", "3,950"],
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
["", "Source: U.S. Department of Justice, Federal Bureau of Investigation, “Crime in the United States, Arrests,” September 2010,", "", "", "", ""]
]

data_stream_table_areas = [
["", "One Withholding"],
["Payroll Period", "Allowance"],
Expand Down Expand Up @@ -248,6 +341,38 @@
["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"]
]

# Expected contents of the first table in the lattice "two tables" test:
# test_lattice_two_tables (tests/test_common.py) wraps this list in a
# pandas DataFrame and compares it with tables[0].df parsed from
# twotables_2.pdf. The first two rows are the header rows; "" marks a
# cell with no text.
data_lattice_two_tables_1 = [
["State", "n", "Literacy Status", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
["Kerala", "2400", "7.2", "0.5", "25.3", "20.1", "41.5", "5.5"],
["Tamil Nadu", "2400", "21.4", "2.3", "8.8", "35.5", "25.8", "6.2"],
["Karnataka", "2399", "37.4", "2.8", "12.5", "18.3", "23.1", "5.8"],
["Andhra Pradesh", "2400", "54.0", "1.7", "8.4", "13.2", "18.8", "3.9"],
["Maharashtra", "2400", "22.0", "0.9", "17.3", "20.3", "32.6", "7.0"],
["Gujarat", "2390", "28.6", "0.1", "14.4", "23.1", "26.9", "6.8"],
["Madhya Pradesh", "2402", "29.1", "3.4", "8.5", "35.1", "13.3", "10.6"],
["Orissa", "2405", "33.2", "1.0", "10.4", "25.7", "21.2", "8.5"],
["West Bengal", "2293", "41.7", "4.4", "13.2", "17.1", "21.2", "2.4"],
["Uttar Pradesh", "2400", "35.3", "2.1", "4.5", "23.3", "27.1", "7.6"],
["Pooled", "23889", "30.9", "1.9", "12.3", "23.2", "25.2", "6.4"]
]

# Expected contents of the second table in the lattice "two tables" test:
# test_lattice_two_tables (tests/test_common.py) wraps this list in a
# pandas DataFrame and compares it with tables[1].df parsed from
# twotables_2.pdf. The first two rows are the header rows; "" marks a
# cell with no text.
data_lattice_two_tables_2 = [
["State", "n", "Literacy Status", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College"],
["Kerala", "2400", "8.8", "0.3", "20.1", "17.0", "45.6", "8.2"],
["Tamil Nadu", "2400", "29.9", "1.5", "8.5", "33.1", "22.3", "4.8"],
["Karnataka", "2399", "47.9", "2.5", "10.2", "18.8", "18.4", "2.3"],
["Andhra Pradesh", "2400", "66.4", "0.7", "6.8", "12.9", "11.4", "1.8"],
["Maharashtra", "2400", "41.3", "0.6", "14.1", "20.1", "21.6", "2.2"],
["Gujarat", "2390", "57.6", "0.1", "10.3", "16.5", "12.9", "2.7"],
["Madhya Pradesh", "2402", "58.7", "2.2", "6.6", "24.1", "5.3", "3.0"],
["Orissa", "2405", "50.0", "0.9", "8.1", "21.9", "15.1", "4.0"],
["West Bengal", "2293", "49.1", "4.8", "11.2", "16.8", "17.1", "1.1"],
["Uttar Pradesh", "2400", "67.3", "2.0", "3.1", "17.2", "7.7", "2.7"],
["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
]

data_lattice_table_areas = [
["", "", "", "", "", "", "", "", ""],
["State", "n", "Literacy Status", "", "", "", "", "", ""],
Expand Down
22 changes: 22 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ def test_stream_table_rotated():
assert df.equals(tables[0].df)


def test_stream_two_tables():
    """Check that the stream flavor finds both tables in tabula/12s0324.pdf.

    The PDF is parsed with ``flavor='stream'``; exactly two tables must be
    returned, and each table's DataFrame must equal the DataFrame built
    from the matching ``data_stream_two_tables_*`` fixture, in order.
    """
    # Build the expected frames first, in the order the tables must appear.
    expected_frames = [
        pd.DataFrame(data_stream_two_tables_1),
        pd.DataFrame(data_stream_two_tables_2),
    ]

    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor='stream')
    assert len(parsed) == 2
    for position, expected in enumerate(expected_frames):
        assert expected.equals(parsed[position].df)


def test_stream_table_areas():
df = pd.DataFrame(data_stream_table_areas)

Expand Down Expand Up @@ -111,6 +122,17 @@ def test_lattice_table_rotated():
assert df.equals(tables[0].df)


def test_lattice_two_tables():
    """Check that read_pdf finds both tables in twotables_2.pdf.

    The PDF is parsed with ``camelot.read_pdf`` and no explicit flavor;
    exactly two tables must be returned, and each table's DataFrame must
    equal the DataFrame built from the matching
    ``data_lattice_two_tables_*`` fixture, in order.
    """
    # Build the expected frames first, in the order the tables must appear.
    expected_frames = [
        pd.DataFrame(data_lattice_two_tables_1),
        pd.DataFrame(data_lattice_two_tables_2),
    ]

    pdf_path = os.path.join(testdir, "twotables_2.pdf")
    parsed = camelot.read_pdf(pdf_path)
    assert len(parsed) == 2
    for position, expected in enumerate(expected_frames):
        assert expected.equals(parsed[position].df)


def test_lattice_table_areas():
df = pd.DataFrame(data_lattice_table_areas)

Expand Down

0 comments on commit 1f71513

Please sign in to comment.