Skip to content

Commit

Permalink
Backport PR #48334 on branch 1.5.x (BUG: read_html(extract_links=all) with no header) (#48350)
Browse files Browse the repository at this point in the history

Backport PR #48334: BUG: read_html(extract_links=all) with no header

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and mroeschke authored Sep 2, 2022
1 parent f0b0630 commit e77e7c1
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 2 deletions.
7 changes: 5 additions & 2 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from pandas import isna
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.indexes.base import Index
from pandas.core.indexes.multi import MultiIndex

from pandas.io.common import (
file_exists,
Expand Down Expand Up @@ -1009,9 +1010,11 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **
try:
df = _data_to_frame(data=table, **kwargs)
# Cast MultiIndex header to an Index of tuples when extracting header
# links and replace nan with None.
# links and replace nan with None (therefore can't use mi.to_flat_index()).
# This maintains consistency of selection (e.g. df.columns.str[1])
if extract_links in ("all", "header"):
if extract_links in ("all", "header") and isinstance(
df.columns, MultiIndex
):
df.columns = Index(
((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
tupleize_cols=False,
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -1416,3 +1416,18 @@ def test_extract_links_bad(self, spam_data):
)
with pytest.raises(ValueError, match=msg):
read_html(spam_data, extract_links="incorrect")

def test_extract_links_all_no_header(self):
    # GH 48316
    # The table below has a single body row and no header row; its only
    # cell wraps an anchor, so with extract_links="all" the cell is
    # expected to come back as a (text, href) tuple.
    html = """
<table>
<tr>
<td>
<a href='https://google.com'>Google.com</a>
</td>
</tr>
</table>
"""
    link_cell = ("Google.com", "https://google.com")
    tables = self.read_html(html, extract_links="all")
    expected = DataFrame([[link_cell]])
    # Only the first parsed table is compared.
    tm.assert_frame_equal(tables[0], expected)

0 comments on commit e77e7c1

Please sign in to comment.