diff --git a/camelot/utils.py b/camelot/utils.py index 6b7eefdc..4ed5bff2 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -958,7 +958,6 @@ def flag_font_size( str The processed string with flagged super/subscripts. """ - # Determine size based on direction and collect text and size d: list[tuple[str, float]] = [] if direction == "horizontal": @@ -1174,46 +1173,43 @@ def _group_and_process_chars( def get_table_index( table, t, direction, split_text=False, flag_size=False, strip_text="" ): - """Get indices of the table cell. + """ + Get indices of the table cell. - Get the index of a table cell where given text object lies by + Get the index of a table cell where a given text object lies by comparing their y and x-coordinates. Parameters ---------- table : camelot.core.Table + The table structure containing rows and columns. t : object PDFMiner LTTextLine object. direction : string Direction of the PDFMiner LTTextLine object. split_text : bool, optional (default: False) - Whether or not to split a text line if it spans across - multiple cells. + Whether or not to split a text line if it spans across multiple cells. flag_size : bool, optional (default: False) - Whether or not to highlight a substring using - if its size is different from rest of the string. (Useful for - super and subscripts) + Whether to highlight a substring using if its size is different + from the rest of the string. strip_text : str, optional (default: '') - Characters that should be stripped from a string before - assigning it to a cell. + Characters that should be stripped from a string before assigning it to a cell. Returns ------- - indices : list - List of tuples of the form (r_idx, c_idx, text) where r_idx - and c_idx are row and column indices. - error : float - Assignment error, percentage of text area that lies outside - a cell. + list + List of tuples of the form (r_idx, c_idx, text) where r_idx and c_idx + are row and column indices, respectively. + float + Assignment error, percentage of text area that lies outside a cell. +-------+ | | | [Text bounding box] | | +-------+ - """ r_idx, c_idx = [-1] * 2 - for r in range(len(table.rows)): + for r in range(len(table.rows)): # noqa if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[ r ][1]: @@ -1230,13 +1226,53 @@ def get_table_index( text_range = (t.x0, t.x1) col_range = (table.cols[0][0], table.cols[-1][1]) warnings.warn( - f"{text} {text_range} does not lie in column range {col_range}" + f"{text} {text_range} does not lie in column range {col_range}", + stacklevel=1, ) r_idx = r c_idx = lt_col_overlap.index(max(lt_col_overlap)) break + if r_idx == -1: + return [], 1.0 # Return early if no valid row is found + + error = calculate_assignment_error(t, table, r_idx, c_idx) - # error calculation + if split_text: + return ( + split_textline( + table, t, direction, flag_size=flag_size, strip_text=strip_text + ), + error, + ) + text = t.get_text().strip("\n") + if flag_size: + return [ + (r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text)) + ], error + else: + return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error + + +def calculate_assignment_error(t, table, r_idx, c_idx): + """ + Calculate the assignment error for the given text object. + + Parameters + ---------- + t : object + PDFMiner LTTextLine object. + table : camelot.core.Table + The table structure containing rows and columns. + r_idx : int + Row index where the text object is located. + c_idx : int + Column index where the text object is located. + + Returns + ------- + float + The calculated assignment error. + """ y0_offset, y1_offset, x0_offset, x1_offset = [0] * 4 if t.y0 > table.rows[r_idx][0]: y0_offset = abs(t.y0 - table.rows[r_idx][0]) @@ -1246,32 +1282,13 @@ def get_table_index( x0_offset = abs(t.x0 - table.cols[c_idx][0]) if t.x1 > table.cols[c_idx][1]: x1_offset = abs(t.x1 - table.cols[c_idx][1]) + x = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) + charea = x * y error = ((x * (y0_offset + y1_offset)) + (y * (x0_offset + x1_offset))) / charea - - if split_text: - return ( - split_textline( - table, t, direction, flag_size=flag_size, strip_text=strip_text - ), - error, - ) - else: - if flag_size: - return ( - [ - ( - r_idx, - c_idx, - flag_font_size(t._objs, direction, strip_text=strip_text), - ) - ], - error, - ) - else: - return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error + return error def compute_accuracy(error_weights):