nanos · nanos · Sep 2, 2024 · Aug 18, 2024 · Aug 18, 2024 · Aug 18, 2024
diff --git a/README.md b/README.md
@@ -151,6 +151,7 @@ Option | Required? | Notes |
 |:----------------------------------------------------|-----------|:------|
 |`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called  `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. To set tokens for multiple users using environment variables, define multiple environment variables with `FF_ACCESS_TOKEN` prefix, eg. `FF_ACCESS_TOKEN_USER1=…` and `FF_ACCESS_TOKEN_USER2=…`|
 |`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. |
+|`instance-blocklist` | No | A comma-separated list of instance domains that FediFetcher should never attempt to connect to.
 |`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`.
 | `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope.
 | `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope.

diff --git a/find_posts.py b/find_posts.py
@@ -53,6 +53,7 @@
 argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.")
 argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)")
 argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format")
+argparser.add_argument('--instance-blocklist', required=False, type=str, default="",help="A comma-seperated array of instances that FediFetcher should never try to connect to")
 
 def get_notification_users(server, access_token, known_users, max_age):
     since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
@@ -1120,6 +1121,10 @@ def can_fetch(user_agent, url):
     parsed_uri = urlparse(url)
     robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
 
+    if parsed_uri.netloc in INSTANCE_BLOCKLIST:
+        # Never connect to these locations
+        raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist")
+
     robotsTxt = get_robots_from_url(robots_url)
     if isinstance(robotsTxt, bool):
         return robotsTxt
@@ -1501,7 +1506,8 @@ def fetch_timeline_context(timeline_posts, token, parsed_urls, seen_hosts, seen_
                 "on_done",
                 "on_fail",
                 "log_level",
-                "log_format"
+                "log_format",
+                "instance_blocklist"
             ]:
                 value = int(value)
             setattr(arguments, envvar, value)
@@ -1572,6 +1578,7 @@ def fetch_timeline_context(timeline_posts, token, parsed_urls, seen_hosts, seen_
         SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
         RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
 
+        INSTANCE_BLOCKLIST = [x.strip() for x in arguments.instance_blocklist.split(",")]
         ROBOTS_TXT = {}
 
         seen_urls = OrderedSet([])

diff --git a/tests/test_find_posts.py b/tests/test_find_posts.py
@@ -1446,6 +1446,7 @@ def test_can_fetch(mock_robotFileParser, mock_get_robots_from_url):
     # Prepare mocks
     mock_robotsTxt = MagicMock()
     mock_robotParser = MagicMock()
+    find_posts.INSTANCE_BLOCKLIST = []
 
     # Mock return values
     mock_get_robots_from_url.return_value = mock_robotsTxt