Skip to content

Commit

Permalink
Merge pull request #206 from socialcopsdev/stream-nurminen-detection
Browse files Browse the repository at this point in the history
[MRG] Add implementation of Anssi Nurminen's table detection algorithm
  • Loading branch information
Vinayak Mehta authored Nov 23, 2018
2 parents cd3aa38 + 23ec6b5 commit e7835ca
Show file tree
Hide file tree
Showing 13 changed files with 534 additions and 144 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ install:
pip install ".[dev]"

test:
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl

docs:
cd docs && make html
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ $ conda install -c conda-forge camelot-py

### Using pip

After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:

<pre>
$ pip install camelot-py[cv]
Expand Down Expand Up @@ -128,4 +128,4 @@ Camelot uses [Semantic Versioning](https://semver.org/). For the available versi

## License

This project is licensed under the MIT License, see the [LICENSE](https://github.com/socialcopsdev/camelot/blob/master/LICENSE) file for details.
This project is licensed under the MIT License, see the [LICENSE](https://github.com/socialcopsdev/camelot/blob/master/LICENSE) file for details.
16 changes: 14 additions & 2 deletions camelot/__version__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
# -*- coding: utf-8 -*-

VERSION = (0, 3, 2)
VERSION = (0, 4, 0)
PRERELEASE = None # alpha, beta or rc
REVISION = None


def generate_version(version, prerelease=None, revision=None):
version_parts = ['.'.join(map(str, version))]
if prerelease is not None:
version_parts.append('-{}'.format(prerelease))
if revision is not None:
version_parts.append('.{}'.format(revision))
return ''.join(version_parts)


__title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.'
__url__ = 'http://camelot-py.readthedocs.io/'
__version__ = '.'.join(map(str, VERSION))
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
__author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com'
__license__ = 'MIT License'
199 changes: 199 additions & 0 deletions camelot/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,210 @@
import os
import zipfile
import tempfile
from itertools import chain
from operator import itemgetter

import numpy as np
import pandas as pd


# minimum number of vertical textline intersections for a textedge
# to be considered valid
TEXTEDGE_REQUIRED_ELEMENTS = 4
# y coordinate tolerance for extending textedge
TEXTEDGE_EXTEND_TOLERANCE = 50
# padding added to table area on the left, right and bottom
TABLE_AREA_PADDING = 10


class TextEdge(object):
"""Defines a text edge coordinates relative to a left-bottom
origin. (PDF coordinate space)
Parameters
----------
x : float
x-coordinate of the text edge.
y0 : float
y-coordinate of bottommost point.
y1 : float
y-coordinate of topmost point.
align : string, optional (default: 'left')
{'left', 'right', 'middle'}
Attributes
----------
intersections: int
Number of intersections with horizontal text rows.
is_valid: bool
A text edge is valid if it intersections with at least
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
"""
def __init__(self, x, y0, y1, align='left'):
self.x = x
self.y0 = y0
self.y1 = y1
self.align = align
self.intersections = 0
self.is_valid = False

def __repr__(self):
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)

def update_coords(self, x, y0):
"""Updates the text edge's x and bottom y coordinates and sets
the is_valid attribute.
"""
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
self.y0 = y0
self.intersections += 1
# a textedge is valid only if it extends uninterrupted
# over a required number of textlines
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
self.is_valid = True


class TextEdges(object):
"""Defines a dict of left, right and middle text edges found on
the PDF page. The dict has three keys based on the alignments,
and each key's value is a list of camelot.core.TextEdge objects.
"""
def __init__(self):
self._textedges = {'left': [], 'right': [], 'middle': []}

@staticmethod
def get_x_coord(textline, align):
"""Returns the x coordinate of a text row based on the
specified alignment.
"""
x_left = textline.x0
x_right = textline.x1
x_middle = x_left + (x_right - x_left) / 2.0
x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
return x_coord[align]

def find(self, x_coord, align):
"""Returns the index of an existing text edge using
the specified x coordinate and alignment.
"""
for i, te in enumerate(self._textedges[align]):
if np.isclose(te.x, x_coord, atol=0.5):
return i
return None

def add(self, textline, align):
"""Adds a new text edge to the current dict.
"""
x = self.get_x_coord(textline, align)
y0 = textline.y0
y1 = textline.y1
te = TextEdge(x, y0, y1, align=align)
self._textedges[align].append(te)

def update(self, textline):
"""Updates an existing text edge in the current dict.
"""
for align in ['left', 'right', 'middle']:
x_coord = self.get_x_coord(textline, align)
idx = self.find(x_coord, align)
if idx is None:
self.add(textline, align)
else:
self._textedges[align][idx].update_coords(x_coord, textline.y0)

def generate(self, textlines):
"""Generates the text edges dict based on horizontal text
rows.
"""
for tl in textlines:
if len(tl.get_text().strip()) > 1: # TODO: hacky
self.update(tl)

def get_relevant(self):
"""Returns the list of relevant text edges (all share the same
alignment) based on which list intersects horizontal text rows
the most.
"""
intersections_sum = {
'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
'middle': sum(te.intersections for te in self._textedges['middle'] if te.is_valid)
}

# TODO: naive
# get vertical textedges that intersect maximum number of
# times with horizontal textlines
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
return self._textedges[relevant_align]

def get_table_areas(self, textlines, relevant_textedges):
"""Returns a dict of interesting table areas on the PDF page
calculated using relevant text edges.
"""
def pad(area, average_row_height):
x0 = area[0] - TABLE_AREA_PADDING
y0 = area[1] - TABLE_AREA_PADDING
x1 = area[2] + TABLE_AREA_PADDING
# add a constant since table headers can be relatively up
y1 = area[3] + average_row_height * 5
return (x0, y0, x1, y1)

# sort relevant textedges in reading order
relevant_textedges.sort(key=lambda te: (-te.y0, te.x))

table_areas = {}
for te in relevant_textedges:
if te.is_valid:
if not table_areas:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
found = None
for area in table_areas:
# check for overlap
if te.y1 >= area[1] and te.y0 <= area[3]:
found = area
break
if found is None:
table_areas[(te.x, te.y0, te.x, te.y1)] = None
else:
table_areas.pop(found)
updated_area = (
found[0], min(te.y0, found[1]), max(found[2], te.x), max(found[3], te.y1))
table_areas[updated_area] = None

# extend table areas based on textlines that overlap
# vertically. it's possible that these textlines were
# eliminated during textedges generation since numbers and
# chars/words/sentences are often aligned differently.
# drawback: table areas that have paragraphs on their sides
# will include the paragraphs too.
sum_textline_height = 0
for tl in textlines:
sum_textline_height += tl.y1 - tl.y0
found = None
for area in table_areas:
# check for overlap
if tl.y0 >= area[1] and tl.y1 <= area[3]:
found = area
break
if found is not None:
table_areas.pop(found)
updated_area = (
min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1))
table_areas[updated_area] = None
average_textline_height = sum_textline_height / float(len(textlines))

# add some padding to table areas
table_areas_padded = {}
for area in table_areas:
table_areas_padded[pad(area, average_textline_height)] = None

return table_areas_padded


class Cell(object):
"""Defines a cell in a table with coordinates relative to a
left-bottom origin. (PDF coordinate space)
Expand Down
47 changes: 42 additions & 5 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd

from .base import BaseParser
from ..core import Table
from ..core import TextEdges, Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace)

Expand Down Expand Up @@ -116,7 +116,7 @@ def _group_rows(text, row_close_tol=2):
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # hacky
__ = rows.pop(0) # TODO: hacky
return rows

@staticmethod
Expand Down Expand Up @@ -246,6 +246,31 @@ def _validate_columns(self):
raise ValueError("Length of table_areas and columns"
" should be equal")

def _nurminen_table_detection(self, textlines):
"""A general implementation of the table detection algorithm
described by Anssi Nurminen's master's thesis.
Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
Assumes that tables are situated relatively far apart
vertically.
"""

# TODO: add support for arabic text #141
# sort textlines in reading order
textlines.sort(key=lambda x: (-x.y0, x.x0))
textedges = TextEdges()
# generate left, middle and right textedges
textedges.generate(textlines)
# select relevant edges
relevant_textedges = textedges.get_relevant()
# guess table areas using textlines and relevant edges
table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
# treat whole page as table area if no table areas found
if not len(table_bbox):
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}

return table_bbox

def _generate_table_bbox(self):
if self.table_areas is not None:
table_bbox = {}
Expand All @@ -257,7 +282,8 @@ def _generate_table_bbox(self):
y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None
else:
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(self.horizontal_text)
self.table_bbox = table_bbox

def _generate_columns_and_rows(self, table_idx, tk):
Expand Down Expand Up @@ -286,10 +312,21 @@ def _generate_columns_and_rows(self, table_idx, tk):
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
# calculate mode of the list of number of elements in
# each row to guess the number of columns
ncols = max(set(elements), key=elements.count)
if ncols == 1:
warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname)))
# if mode is 1, the page usually contains not tables
# but there can be cases where the list can be skewed,
# try to remove all 1s from list in this case and
# see if the list contains elements, if yes, then use
# the mode after removing 1s
elements = list(filter(lambda x: x != 1, elements))
if len(elements):
ncols = max(set(elements), key=elements.count)
else:
warnings.warn("No tables found in table area {}".format(
table_idx + 1))
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
inner_text = []
Expand Down
2 changes: 1 addition & 1 deletion docs/dev/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ If you're reading this, you're probably looking to contributing to Camelot. *Tim

This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to `Vinayak Mehta`_, the author and maintainer.

.. _Vinayak Mehta: https://vinayak-mehta.github.io
.. _Vinayak Mehta: https://www.vinayakmehta.com

Code Of Conduct
---------------
Expand Down
3 changes: 2 additions & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ This part of the documentation begins with some background information about why
:maxdepth: 2

user/intro
user/install-deps
user/install
user/how-it-works
user/quickstart
Expand All @@ -118,4 +119,4 @@ you.
.. toctree::
:maxdepth: 2

dev/contributing
dev/contributing
Loading

0 comments on commit e7835ca

Please sign in to comment.