scantist-ossops-m2 · cingmanwu · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
@@ -8,9 +8,10 @@
 import copy
 try:
     from urlparse import urlsplit
+    from urllib import unquote_plus
 except ImportError:
     # Python 3
-    from urllib.parse import urlsplit
+    from urllib.parse import urlsplit, unquote_plus
 from lxml import etree
 from lxml.html import defs
 from lxml.html import fromstring, XHTML_NAMESPACE
@@ -482,7 +483,7 @@ def _kill_elements(self, doc, condition, iterate=None):
 
     def _remove_javascript_link(self, link):
         # links like "j a v a s c r i p t:" might be interpreted in IE
-        new = _substitute_whitespace('', link)
+        new = _substitute_whitespace('', unquote_plus(link))
         if _is_javascript_scheme(new):
             # FIXME: should this be None to delete?
             return ''
@@ -509,6 +510,11 @@ def _has_sneaky_javascript(self, style):
             return True
         if 'expression(' in style:
             return True
+        if '@import' in style:
+            return True
+        if '</noscript' in style:
+            # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+            return True
         return False
 
     def clean_html(self, html):