Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for instance banlist #161

Merged
merged 5 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ Option | Required? | Notes |
|:----------------------------------------------------|-----------|:------|
|`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. To set tokens for multiple users using environment variables, define multiple environment variables with `FF_ACCESS_TOKEN` prefix, eg. `FF_ACCESS_TOKEN_USER1=…` and `FF_ACCESS_TOKEN_USER2=…`|
|`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. |
|`instance-blocklist` | No | A comma-separated list of instance domains that FediFetcher should never attempt to connect to.
|`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`.
| `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope.
| `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope.
Expand Down
9 changes: 8 additions & 1 deletion find_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.")
argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)")
argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format")
argparser.add_argument('--instance-blocklist', required=False, type=str, default="",help="A comma-seperated array of instances that FediFetcher should never try to connect to")

def get_notification_users(server, access_token, known_users, max_age):
since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
Expand Down Expand Up @@ -1120,6 +1121,10 @@ def can_fetch(user_agent, url):
parsed_uri = urlparse(url)
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)

if parsed_uri.netloc in INSTANCE_BLOCKLIST:
# Never connect to these locations
raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist")

robotsTxt = get_robots_from_url(robots_url)
if isinstance(robotsTxt, bool):
return robotsTxt
Expand Down Expand Up @@ -1501,7 +1506,8 @@ def fetch_timeline_context(timeline_posts, token, parsed_urls, seen_hosts, seen_
"on_done",
"on_fail",
"log_level",
"log_format"
"log_format",
"instance_blocklist"
]:
value = int(value)
setattr(arguments, envvar, value)
Expand Down Expand Up @@ -1572,6 +1578,7 @@ def fetch_timeline_context(timeline_posts, token, parsed_urls, seen_hosts, seen_
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')

INSTANCE_BLOCKLIST = [x.strip() for x in arguments.instance_blocklist.split(",")]
ROBOTS_TXT = {}

seen_urls = OrderedSet([])
Expand Down
1 change: 1 addition & 0 deletions tests/test_find_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -1446,6 +1446,7 @@ def test_can_fetch(mock_robotFileParser, mock_get_robots_from_url):
# Prepare mocks
mock_robotsTxt = MagicMock()
mock_robotParser = MagicMock()
find_posts.INSTANCE_BLOCKLIST = []

# Mock return values
mock_get_robots_from_url.return_value = mock_robotsTxt
Expand Down