Backport PR #48334 on branch 1.5.x (BUG: read_html(extract_links=all) with no header) (#48350)

Backport PR #48334: BUG: read_html(extract_links=all) with no header

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
pandas-dev · Sep 2, 2022 · e77e7c1 · e77e7c1
1 parent f0b0630
commit e77e7c1
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 2 deletions.
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -34,6 +34,7 @@
 from pandas import isna
 from pandas.core.construction import create_series_with_explicit_dtype
 from pandas.core.indexes.base import Index
+from pandas.core.indexes.multi import MultiIndex
 
 from pandas.io.common import (
     file_exists,
@@ -1009,9 +1010,11 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **
         try:
             df = _data_to_frame(data=table, **kwargs)
             # Cast MultiIndex header to an Index of tuples when extracting header
-            # links and replace nan with None.
+            # links and replace nan with None (therefore can't use mi.to_flat_index()).
             # This maintains consistency of selection (e.g. df.columns.str[1])
-            if extract_links in ("all", "header"):
+            if extract_links in ("all", "header") and isinstance(
+                df.columns, MultiIndex
+            ):
                 df.columns = Index(
                     ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                     tupleize_cols=False,

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1416,3 +1416,18 @@ def test_extract_links_bad(self, spam_data):
         )
         with pytest.raises(ValueError, match=msg):
             read_html(spam_data, extract_links="incorrect")
+
+    def test_extract_links_all_no_header(self):
+        # GH 48316: extract_links="all" on a table that has no header row
+        data = """
+        <table>
+          <tr>
+            <td>
+              <a href='https://google.com'>Google.com</a>
+            </td>
+          </tr>
+        </table>
+        """
+        result = self.read_html(data, extract_links="all")[0]
+        expected = DataFrame([[("Google.com", "https://google.com")]])
+        tm.assert_frame_equal(result, expected)