Merge pull request #206 from socialcopsdev/stream-nurminen-detection

[MRG] Add implementation of Anssi Nurminen's table detection algorithm
atlanhq · Nov 23, 2018 · e7835ca · e7835ca
2 parents cd3aa38 + 23ec6b5
commit e7835ca
Show file tree

Hide file tree

Showing 13 changed files with 534 additions and 144 deletions.
diff --git a/Makefile b/Makefile
@@ -15,7 +15,7 @@ install:
 	pip install ".[dev]"
 
 test:
-	pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
+	pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
 
 docs:
 	cd docs && make html

diff --git a/README.md b/README.md
@@ -69,7 +69,7 @@ $ conda install -c conda-forge camelot-py
 
 ### Using pip
 
-After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
+After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install-deps.html) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
 
 <pre>
 $ pip install camelot-py[cv]
@@ -128,4 +128,4 @@ Camelot uses [Semantic Versioning](https://semver.org/). For the available versi
 
 ## License
 
-This project is licensed under the MIT License, see the [LICENSE](https://github.com/socialcopsdev/camelot/blob/master/LICENSE) file for details.
+This project is licensed under the MIT License, see the [LICENSE](https://github.com/socialcopsdev/camelot/blob/master/LICENSE) file for details.
diff --git a/camelot/__version__.py b/camelot/__version__.py
@@ -1,11 +1,23 @@
 # -*- coding: utf-8 -*-
 
-VERSION = (0, 3, 2)
+VERSION = (0, 4, 0)
+PRERELEASE = None # alpha, beta or rc
+REVISION = None
+
+
+def generate_version(version, prerelease=None, revision=None):
+    version_parts = ['.'.join(map(str, version))]
+    if prerelease is not None:
+        version_parts.append('-{}'.format(prerelease))
+    if revision is not None:
+        version_parts.append('.{}'.format(revision))
+    return ''.join(version_parts)
+
 
 __title__ = 'camelot-py'
 __description__ = 'PDF Table Extraction for Humans.'
 __url__ = 'http://camelot-py.readthedocs.io/'
-__version__ = '.'.join(map(str, VERSION))
+__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
 __author__ = 'Vinayak Mehta'
 __author_email__ = 'vmehta94@gmail.com'
 __license__ = 'MIT License'
diff --git a/camelot/core.py b/camelot/core.py
@@ -3,11 +3,210 @@
 import os
 import zipfile
 import tempfile
+from itertools import chain
+from operator import itemgetter
 
 import numpy as np
 import pandas as pd
 
 
+# minimum number of vertical textline intersections for a textedge
+# to be considered valid
+TEXTEDGE_REQUIRED_ELEMENTS = 4
+# y coordinate tolerance for extending textedge
+TEXTEDGE_EXTEND_TOLERANCE = 50
+# padding added to table area on the left, right and bottom
+TABLE_AREA_PADDING = 10
+
+
+class TextEdge(object):
+    """Defines a text edge with coordinates relative to a
+    left-bottom origin. (PDF coordinate space)
+
+    Parameters
+    ----------
+    x : float
+        x-coordinate of the text edge.
+    y0 : float
+        y-coordinate of bottommost point.
+    y1 : float
+        y-coordinate of topmost point.
+    align : string, optional (default: 'left')
+        {'left', 'right', 'middle'}
+
+    Attributes
+    ----------
+    intersections: int
+        Number of intersections with horizontal text rows.
+    is_valid: bool
+        A text edge is valid if it intersects with more than
+        TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
+
+    """
+    def __init__(self, x, y0, y1, align='left'):
+        self.x = x
+        self.y0 = y0
+        self.y1 = y1
+        self.align = align
+        self.intersections = 0
+        self.is_valid = False
+
+    def __repr__(self):
+        return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
+            round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
+
+    def update_coords(self, x, y0):
+        """Updates the text edge's x and bottom y coordinates and sets
+        the is_valid attribute.
+        """
+        if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
+            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
+            self.y0 = y0
+            self.intersections += 1
+            # a textedge is valid only if it extends uninterrupted
+            # over a required number of textlines
+            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
+                self.is_valid = True
+
+
+class TextEdges(object):
+    """Defines a dict of left, right and middle text edges found on
+    the PDF page. The dict has three keys based on the alignments,
+    and each key's value is a list of camelot.core.TextEdge objects.
+    """
+    def __init__(self):
+        self._textedges = {'left': [], 'right': [], 'middle': []}
+
+    @staticmethod
+    def get_x_coord(textline, align):
+        """Returns the x coordinate of a text row based on the
+        specified alignment.
+        """
+        x_left = textline.x0
+        x_right = textline.x1
+        x_middle = x_left + (x_right - x_left) / 2.0
+        x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
+        return x_coord[align]
+
+    def find(self, x_coord, align):
+        """Returns the index of an existing text edge using
+        the specified x coordinate and alignment.
+        """
+        for i, te in enumerate(self._textedges[align]):
+            if np.isclose(te.x, x_coord, atol=0.5):
+                return i
+        return None
+
+    def add(self, textline, align):
+        """Adds a new text edge to the current dict.
+        """
+        x = self.get_x_coord(textline, align)
+        y0 = textline.y0
+        y1 = textline.y1
+        te = TextEdge(x, y0, y1, align=align)
+        self._textedges[align].append(te)
+
+    def update(self, textline):
+        """Updates an existing text edge in the current dict.
+        """
+        for align in ['left', 'right', 'middle']:
+            x_coord = self.get_x_coord(textline, align)
+            idx = self.find(x_coord, align)
+            if idx is None:
+                self.add(textline, align)
+            else:
+                self._textedges[align][idx].update_coords(x_coord, textline.y0)
+
+    def generate(self, textlines):
+        """Generates the text edges dict based on horizontal text
+        rows.
+        """
+        for tl in textlines:
+            if len(tl.get_text().strip()) > 1: # TODO: hacky
+                self.update(tl)
+
+    def get_relevant(self):
+        """Returns the list of relevant text edges (all share the same
+        alignment) based on which list intersects horizontal text rows
+        the most.
+        """
+        intersections_sum = {
+            'left': sum(te.intersections for te in self._textedges['left'] if te.is_valid),
+            'right': sum(te.intersections for te in self._textedges['right'] if te.is_valid),
+            'middle': sum(te.intersections for te in self._textedges['middle'] if te.is_valid)
+        }
+
+        # TODO: naive
+        # get vertical textedges that intersect maximum number of
+        # times with horizontal textlines
+        relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
+        return self._textedges[relevant_align]
+
+    def get_table_areas(self, textlines, relevant_textedges):
+        """Returns a dict of interesting table areas on the PDF page
+        calculated using relevant text edges.
+        """
+        def pad(area, average_row_height):
+            x0 = area[0] - TABLE_AREA_PADDING
+            y0 = area[1] - TABLE_AREA_PADDING
+            x1 = area[2] + TABLE_AREA_PADDING
+            # add extra room on top since table headers can sit relatively high
+            y1 = area[3] + average_row_height * 5
+            return (x0, y0, x1, y1)
+
+        # sort relevant textedges in reading order
+        relevant_textedges.sort(key=lambda te: (-te.y0, te.x))
+
+        table_areas = {}
+        for te in relevant_textedges:
+            if te.is_valid:
+                if not table_areas:
+                    table_areas[(te.x, te.y0, te.x, te.y1)] = None
+                else:
+                    found = None
+                    for area in table_areas:
+                        # check for overlap
+                        if te.y1 >= area[1] and te.y0 <= area[3]:
+                            found = area
+                            break
+                    if found is None:
+                        table_areas[(te.x, te.y0, te.x, te.y1)] = None
+                    else:
+                        table_areas.pop(found)
+                        updated_area = (
+                            found[0], min(te.y0, found[1]), max(found[2], te.x), max(found[3], te.y1))
+                        table_areas[updated_area] = None
+
+        # extend table areas based on textlines that overlap
+        # vertically. it's possible that these textlines were
+        # eliminated during textedges generation since numbers and
+        # chars/words/sentences are often aligned differently.
+        # drawback: table areas that have paragraphs on their sides
+        # will include the paragraphs too.
+        sum_textline_height = 0
+        for tl in textlines:
+            sum_textline_height += tl.y1 - tl.y0
+            found = None
+            for area in table_areas:
+                # check that the textline lies vertically within the area
+                if tl.y0 >= area[1] and tl.y1 <= area[3]:
+                    found = area
+                    break
+            if found is not None:
+                table_areas.pop(found)
+                updated_area = (
+                    min(tl.x0, found[0]), min(tl.y0, found[1]), max(found[2], tl.x1), max(found[3], tl.y1))
+                table_areas[updated_area] = None
+        average_textline_height = sum_textline_height / float(len(textlines))
+
+        # add some padding to table areas
+        table_areas_padded = {}
+        for area in table_areas:
+            table_areas_padded[pad(area, average_textline_height)] = None
+
+        return table_areas_padded
+
+
 class Cell(object):
     """Defines a cell in a table with coordinates relative to a
     left-bottom origin. (PDF coordinate space)

diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
@@ -9,7 +9,7 @@
 import pandas as pd
 
 from .base import BaseParser
-from ..core import Table
+from ..core import TextEdges, Table
 from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
                      compute_whitespace)
 
@@ -116,7 +116,7 @@ def _group_rows(text, row_close_tol=2):
                     row_y = t.y0
                 temp.append(t)
         rows.append(sorted(temp, key=lambda t: t.x0))
-        __ = rows.pop(0)  # hacky
+        __ = rows.pop(0)  # TODO: hacky
         return rows
 
     @staticmethod
@@ -246,6 +246,31 @@ def _validate_columns(self):
                 raise ValueError("Length of table_areas and columns"
                                  " should be equal")
 
+    def _nurminen_table_detection(self, textlines):
+        """A general implementation of the table detection algorithm
+        described in Anssi Nurminen's master's thesis.
+        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
+
+        Assumes that tables are situated relatively far apart
+        vertically.
+        """
+
+        # TODO: add support for arabic text #141
+        # sort textlines in reading order
+        textlines.sort(key=lambda x: (-x.y0, x.x0))
+        textedges = TextEdges()
+        # generate left, middle and right textedges
+        textedges.generate(textlines)
+        # select relevant edges
+        relevant_textedges = textedges.get_relevant()
+        # guess table areas using textlines and relevant edges
+        table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
+        # treat whole page as table area if no table areas found
+        if not len(table_bbox):
+            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
+
+        return table_bbox
+
     def _generate_table_bbox(self):
         if self.table_areas is not None:
             table_bbox = {}
@@ -257,7 +282,8 @@ def _generate_table_bbox(self):
                 y2 = float(y2)
                 table_bbox[(x1, y2, x2, y1)] = None
         else:
-            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
+            # find tables based on nurminen's detection algorithm
+            table_bbox = self._nurminen_table_detection(self.horizontal_text)
         self.table_bbox = table_bbox
 
     def _generate_columns_and_rows(self, table_idx, tk):
@@ -286,10 +312,21 @@ def _generate_columns_and_rows(self, table_idx, tk):
             cols.append(text_x_max)
             cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
         else:
+            # calculate mode of the list of number of elements in
+            # each row to guess the number of columns
             ncols = max(set(elements), key=elements.count)
             if ncols == 1:
-                warnings.warn("No tables found on {}".format(
-                    os.path.basename(self.rootname)))
+                # if mode is 1, the page usually contains no tables,
+                # but there can be cases where the list is skewed;
+                # in that case, remove all 1s from the list and
+                # check whether any elements remain. if yes, use
+                # the mode of the remaining elements
+                elements = list(filter(lambda x: x != 1, elements))
+                if len(elements):
+                    ncols = max(set(elements), key=elements.count)
+                else:
+                    warnings.warn("No tables found in table area {}".format(
+                        table_idx + 1))
             cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
             cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol)
             inner_text = []

diff --git a/docs/dev/contributing.rst b/docs/dev/contributing.rst
@@ -7,7 +7,7 @@ If you're reading this, you're probably looking to contributing to Camelot. *Tim
 
 This document will help you get started with contributing documentation, code, testing and filing issues. If you have any questions, feel free to reach out to `Vinayak Mehta`_, the author and maintainer.
 
-.. _Vinayak Mehta: https://vinayak-mehta.github.io
+.. _Vinayak Mehta: https://www.vinayakmehta.com
 
 Code Of Conduct
 ---------------

diff --git a/docs/index.rst b/docs/index.rst
@@ -92,6 +92,7 @@ This part of the documentation begins with some background information about why
    :maxdepth: 2
 
    user/intro
+   user/install-deps
    user/install
    user/how-it-works
    user/quickstart
@@ -118,4 +119,4 @@ you.
 .. toctree::
    :maxdepth: 2
 
-   dev/contributing
+   dev/contributing