Skip to content

Commit

Permalink
url: fix .title vs url callback plugins
Browse files: browse the repository at this point in the history
Also includes a bunch of miscellaneous cleanup.
  • Loading branch information
half-duplex committed May 16, 2022
1 parent cea42e1 commit e629693
Showing 1 changed file with 89 additions and 92 deletions.
181 changes: 89 additions & 92 deletions sopel/modules/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,21 @@
"""
from __future__ import annotations

import ipaddress
from ipaddress import ip_address
import logging
import re
from typing import Generator, List, Optional, Tuple
from urllib.parse import urlparse

import dns.resolver
import requests
from urllib3.exceptions import LocationValueError # type: ignore[import]

from sopel import plugin, tools
from sopel.config import types
from sopel.bot import Sopel
from sopel.config import Config, types
from sopel.tools import web
from sopel.trigger import Trigger


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -62,22 +65,18 @@ class UrlSection(types.StaticSection):
"""If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters."""
enable_private_resolution = types.BooleanAttribute(
'enable_private_resolution', default=False)
"""Enable URL lookups for RFC1918 addresses"""
enable_dns_resolution = types.BooleanAttribute(
'enable_dns_resolution', default=False)
"""Enable DNS resolution for all domains to validate if there are RFC1918 resolutions"""
"""Enable requests to private and local network IP addresses"""


def configure(config):
def configure(config: Config):
"""
| name | example | purpose |
| ---- | ------- | ------- |
| enable_auto_title | yes | Enable auto-title. |
| exclude | https?://git\\\\.io/.* | A list of regular expressions for URLs for which the title should not be shown. |
| exclusion\\_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. |
| shorten\\_url\\_length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. |
| enable\\_private\\_resolution | False | Enable URL lookups for RFC1918 addresses. |
| enable\\_dns\\_resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. |
| enable\\_private\\_resolution | False | Enable requests to private and local network IP addresses. |
"""
config.define_section('url', UrlSection)
config.url.configure_setting(
Expand All @@ -100,15 +99,11 @@ def configure(config):
)
config.url.configure_setting(
'enable_private_resolution',
'Enable URL lookups for RFC1918 addresses?'
)
config.url.configure_setting(
'enable_dns_resolution',
'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?'
'Enable requests to private and local network IP addresses?'
)


def setup(bot):
def setup(bot: Sopel):
bot.config.define_section('url', UrlSection)

if bot.config.url.exclude:
Expand Down Expand Up @@ -139,7 +134,7 @@ def setup(bot):
bot.memory['shortened_urls'] = tools.SopelMemory()


def shutdown(bot):
def shutdown(bot: Sopel):
# Unset `url_exclude` and `last_seen_url`, but not `shortened_urls`;
# clearing `shortened_urls` will increase API calls. Leaving it in memory
# should not lead to unexpected behavior.
Expand All @@ -154,7 +149,7 @@ def shutdown(bot):
@plugin.example('.urlpexclude example\\.com/\\w+', user_help=True)
@plugin.example('.urlexclude example.com/path', user_help=True)
@plugin.output_prefix('[url] ')
def url_ban(bot, trigger):
def url_ban(bot: Sopel, trigger: Trigger):
"""Exclude a URL from auto title.
Use ``urlpexclude`` to exclude a pattern instead of a URL.
Expand Down Expand Up @@ -199,7 +194,7 @@ def url_ban(bot, trigger):
@plugin.example('.urlpallow example\\.com/\\w+', user_help=True)
@plugin.example('.urlallow example.com/path', user_help=True)
@plugin.output_prefix('[url] ')
def url_unban(bot, trigger):
def url_unban(bot: Sopel, trigger: Trigger):
"""Allow a URL for auto title.
Use ``urlpallow`` to allow a pattern instead of a URL.
Expand Down Expand Up @@ -246,30 +241,27 @@ def url_unban(bot, trigger):
'Google | www.google.com',
online=True, vcr=True)
@plugin.output_prefix('[url] ')
def title_command(bot, trigger):
def title_command(bot: Sopel, trigger: Trigger):
"""
Show the title or URL information for the given URL, or the last URL seen
in this channel.
"""
result_count = 0

if not trigger.group(2):
if trigger.sender not in bot.memory['last_seen_url']:
return
matched = check_callbacks(
bot, bot.memory['last_seen_url'][trigger.sender])
if matched:
return
else:
urls = [bot.memory['last_seen_url'][trigger.sender]]
urls = [bot.memory["last_seen_url"][trigger.sender]]
else:
urls = list( # needs to be a list so len() can be checked later
web.search_urls(
trigger,
exclusion_char=bot.config.url.exclusion_char
)
)
# needs to be a list so len() can be checked later
urls = list(web.search_urls(trigger))

result_count = 0
for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
for url, title, domain, tinyurl, dispatched in process_urls(
bot, trigger, urls, requested=True
):
if dispatched:
result_count += 1
continue
message = '%s | %s' % (title, domain)
if tinyurl:
message += ' ( %s )' % tinyurl
Expand All @@ -289,7 +281,7 @@ def title_command(bot, trigger):

@plugin.rule(r'(?u).*(https?://\S+).*')
@plugin.output_prefix('[url] ')
def title_auto(bot, trigger):
def title_auto(bot: Sopel, trigger: Trigger):
"""
Automatically show titles for URLs. For shortened URLs/redirects, find
where the URL redirects to and show the title for that (or call a function
Expand All @@ -311,55 +303,68 @@ def title_auto(bot, trigger):
urls = web.search_urls(
trigger, exclusion_char=bot.config.url.exclusion_char, clean=True)

for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
message = '%s | %s' % (title, domain)
if tinyurl:
message += ' ( %s )' % tinyurl
# Guard against responding to other instances of this bot.
if message != trigger:
bot.say(message)
bot.memory['last_seen_url'][trigger.sender] = url
for url, title, domain, tinyurl, dispatched in process_urls(bot, trigger, urls):
if not dispatched:
message = '%s | %s' % (title, domain)
if tinyurl:
message += ' ( %s )' % tinyurl
# Guard against responding to other instances of this bot.
if message != trigger:
bot.say(message)
bot.memory["last_seen_url"][trigger.sender] = url


def process_urls(bot, trigger, urls):
def process_urls(
bot: Sopel, trigger: Trigger, urls: List[str], requested: bool = False
) -> Generator[Tuple[str, str, Optional[str], Optional[str], bool], None, None]:
"""
For each URL in the list, ensure that it isn't handled by another plugin.
If not, find where it redirects to, if anywhere. If that redirected URL
should be handled by another plugin, dispatch the callback for it.
Return a list of (title, hostname) tuples for each URL which is not handled
by another plugin.
For each URL in the list, ensure it should be titled, and do so.
See if it's handled by another plugin. If not, find where it redirects to,
if anywhere. If that redirected URL should be handled by another plugin,
dispatch the callback for it. Return a list of
(url, title, hostname, tinyurl, dispatched) tuples for each URL.
If a callback was dispatched, only the url and dispatched=True will be set.
For titles explicitly requested by the user, exclusion_char and excludes
are skipped.
:param bot: Sopel instance
:param trigger: The trigger object for this event
:param urls: The URLs detected in the triggering message
:param requested: Whether the title was explicitly requested (vs automatic)
"""
shorten_url_length = bot.config.url.shorten_url_length
for url in urls:
# Exclude URLs that start with the exclusion char
if url.startswith(bot.config.url.exclusion_char):
if not requested and url.startswith(bot.config.url.exclusion_char):
continue

parsed_url = urlparse(url)

# Check the URL does not match an existing URL callback
if check_callbacks(bot, url):
continue
if check_callbacks(bot, url, use_excludes=not requested):
yield (url, None, None, None, True)
return

# Prevent private addresses from being queried if enable_private_resolution is False
# FIXME: This does nothing when an attacker knows how to host a 302
# FIXME: This whole concept has a TOCTOU issue
if not bot.config.url.enable_private_resolution:
parsed = urlparse(url)
# Check if it's an address like http://192.168.1.1
try:
if ipaddress.ip_address(parsed.hostname).is_private or ipaddress.ip_address(parsed.hostname).is_loopback:
LOGGER.debug('Ignoring private URL: %s', url)
continue
ips = [ip_address(parsed_url.hostname)]
except ValueError:
pass

# Check if domains are RFC1918 addresses if enable_dns_resolutions is set
if bot.config.url.enable_dns_resolution:
private = False
for result in dns.resolver.query(parsed.hostname):
if ipaddress.ip_address(result).is_private or ipaddress.ip_address(parsed.hostname).is_loopback:
private = True
break
if private:
LOGGER.debug('Ignoring private URL: %s', url)
continue
ips = [ip_address(ip) for ip in dns.resolver.query(parsed_url.hostname)]

private = False
for ip in ips:
if ip.is_private or ip.is_loopback:
private = True
break
if private:
LOGGER.debug('Ignoring private URL: %s', url)
continue

# Call the URL to get a title, if possible
title = find_title(url)
Expand All @@ -373,14 +378,15 @@ def process_urls(bot, trigger, urls):
if (shorten_url_length > 0) and (len(url) > shorten_url_length):
tinyurl = get_or_create_shorturl(bot, url)

yield (url, title, get_hostname(url), tinyurl)
yield (url, title, parsed_url.hostname, tinyurl, False)


def check_callbacks(bot, url):
def check_callbacks(bot: Sopel, url: str, use_excludes: bool = True) -> bool:
"""Check if ``url`` is excluded or matches any URL callback patterns.
:param bot: Sopel instance
:param str url: URL to check
:param url: URL to check
:param use_excludes: Use or ignore the configured exclusion lists
:return: True if ``url`` is excluded or matches any URL callback pattern
This function looks at the ``bot.memory`` for ``url_exclude`` patterns and
Expand All @@ -400,16 +406,21 @@ def check_callbacks(bot, url):
"""
# Check if it matches the exclusion list first
matched = any(regex.search(url) for regex in bot.memory['url_exclude'])
excluded = False
if use_excludes:
excluded = any(regex.search(url) for regex in bot.memory["url_exclude"])
return (
matched or
excluded or
any(bot.search_url_callbacks(url)) or
bot.rules.check_url_callback(bot, url)
)


def find_title(url, verify=True):
"""Return the title for the given URL."""
def find_title(url: str, verify: bool = True) -> Optional[str]:
"""Return the title for the given URL.
:param verify: Whether to require a valid certificate when using https
"""
try:
response = requests.get(url, stream=True, verify=verify,
headers=DEFAULT_HEADERS)
Expand Down Expand Up @@ -453,26 +464,12 @@ def find_title(url, verify=True):
return title or None


def get_hostname(url):
    """Return the host part of ``url``.

    The result is everything between the ``scheme://`` prefix (if any) and
    the first ``/``, ``?`` or ``#``. A port number or ``user@`` prefix, when
    present, is kept as part of the returned string, and letter case is
    preserved.

    :param url: the URL to extract the host from
    :return: the host part of ``url``; an empty string if ``url`` is empty
    """
    prefix, separator, remainder = url.partition('://')

    # Only treat the text before "://" as a scheme when it looks like one.
    # Otherwise a scheme-less URL such as "example.com/?next=http://other"
    # would be split on the "://" that lives inside its query string.
    has_scheme = (
        bool(separator)
        and bool(prefix)
        and all(char.isalnum() or char in '+-.' for char in prefix)
    )
    host = remainder if has_scheme else url

    # The host ends at whichever of these delimiters comes first; truncating
    # on each in turn leaves the shortest prefix, i.e. the earliest cut.
    for delimiter in '/?#':
        host = host.partition(delimiter)[0]
    return host


def get_or_create_shorturl(bot, url):
def get_or_create_shorturl(bot: Sopel, url: str) -> str:
"""Get or create a short URL for ``url``
:param bot: Sopel instance
:param str url: URL to get or create a short URL for
:param url: URL to get or create a short URL for
:return: A short URL
:rtype: str
It gets the short URL for ``url`` from the bot's memory if it exists.
Otherwise, it creates a short URL (see :func:`get_tinyurl`), stores it
Expand All @@ -488,7 +485,7 @@ def get_or_create_shorturl(bot, url):
return tinyurl


def get_tinyurl(url):
def get_tinyurl(url: str) -> Optional[str]:
"""Returns a shortened tinyURL link of the URL"""
base_url = "https://tinyurl.com/api-create.php"
tinyurl = "%s?%s" % (base_url, web.urlencode({'url': url}))
Expand Down

0 comments on commit e629693

Please sign in to comment.