Skip to content

Commit

Permalink
Merge pull request #55 from guibacellar/40-finder-engine-inside-text-…
Browse files Browse the repository at this point in the history
…downloaded-files

40 finder engine inside text downloaded files
  • Loading branch information
guibacellar authored Nov 9, 2023
2 parents 821cbbc + 1cdab63 commit 83de3d1
Show file tree
Hide file tree
Showing 17 changed files with 395 additions and 164 deletions.
1 change: 1 addition & 0 deletions TEx/core/mapper/telethon_message_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ async def to_finder_notification_facade_entity(message: Message, downloaded_medi
message_id=message.id,
is_reply=message.is_reply,
downloaded_media_info=downloaded_media_info,
found_on='UNDEFINED',
)

return h_result
Expand Down
2 changes: 2 additions & 0 deletions TEx/finder/base_finder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""Base Class for All Finders."""
from __future__ import annotations

import abc


Expand Down
82 changes: 71 additions & 11 deletions TEx/finder/finder_engine.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
"""Finder Engine."""
from __future__ import annotations

from configparser import ConfigParser
from configparser import ConfigParser, SectionProxy
from typing import Dict, List, Optional

import aiofiles
import aiofiles.os

from TEx.finder.all_messages_finder import AllMessagesFinder
from TEx.finder.base_finder import BaseFinder
from TEx.finder.regex_finder import RegexFinder
from TEx.models.facade.finder_notification_facade_entity import FinderNotificationMessageEntity
from TEx.notifier.notifier_engine import NotifierEngine
Expand All @@ -18,12 +22,8 @@ def __init__(self) -> None:
self.is_finder_enabled: bool = False
self.rules: List[Dict] = []
self.notification_engine: NotifierEngine

def __is_finder_enabled(self, config: ConfigParser) -> bool:
"""Check if Finder Module is Enabled."""
return (
config.has_option('FINDER', 'enabled') and config['FINDER']['enabled'] == 'true'
)
self.find_in_text_enabled: bool = False
self.find_in_text_files_max_size_bytes: int = 0

def __load_rules(self, config: ConfigParser) -> None:
"""Load Finder Rules."""
Expand All @@ -35,18 +35,34 @@ def __load_rules(self, config: ConfigParser) -> None:
'id': sec,
'instance': RegexFinder(config=config[sec]),
'notifier': config[sec]['notifier'],
'type': config[sec]['type'],
})
elif config[sec]['type'] == 'all':
self.rules.append({
'id': sec,
'instance': AllMessagesFinder(config=config[sec]),
'notifier': config[sec]['notifier'],
'type': config[sec]['type'],
})

def configure(self, config: ConfigParser, notification_engine: NotifierEngine) -> None:
    """Configure the finder from the ``[FINDER]`` section and store the notifier engine.

    When the ``[FINDER]`` section is present, the ``enabled``,
    ``find_in_text_files_enabled`` and ``find_in_text_files_max_size_bytes``
    options are read (missing options fall back to ``false``, ``false`` and
    ``10000000``) and the rules are loaded once. Otherwise the finder and the
    find-in-text-files behavior are both switched off.

    :param config: Parsed application configuration.
    :param notification_engine: Notifier engine kept on the instance.
    :raises ValueError: If ``find_in_text_files_max_size_bytes`` is not an integer.
    """
    finder_config_proxy: Optional[SectionProxy] = config['FINDER'] if config.has_section('FINDER') else None

    if finder_config_proxy:

        # Get Basic Props (only the exact lowercase string 'true' enables a flag)
        self.is_finder_enabled = finder_config_proxy.get('enabled', fallback='false') == 'true'
        self.find_in_text_enabled = finder_config_proxy.get('find_in_text_files_enabled', fallback='false') == 'true'
        self.find_in_text_files_max_size_bytes = int(
            finder_config_proxy.get('find_in_text_files_max_size_bytes', fallback='10000000'),
        )

        # Load all Rules (exactly once per configure call)
        self.__load_rules(config=config)

    else:
        # No usable FINDER section: make sure no previous state keeps the engine active
        self.is_finder_enabled = False
        self.find_in_text_enabled = False

    # Set Notification Engine
    self.notification_engine = notification_engine

async def run(self, entity: Optional[FinderNotificationMessageEntity], source: str) -> None:
Expand All @@ -59,10 +75,29 @@ async def run(self, entity: Optional[FinderNotificationMessageEntity], source: s
if not self.is_finder_enabled or not entity:
return

cached_file_content: str = ''

for rule in self.rules:
is_found: bool = await rule['instance'].find(raw_text=entity.raw_text)

if is_found:
# Resolve Finder
finder: BaseFinder = rule['instance']

# Find in Raw Text Content
is_found_on_content: bool = await finder.find(raw_text=entity.raw_text)
is_found_on_text_downloaded_file: bool = False

# Find into Downloaded File (If Applicable)
if not is_found_on_content and self.find_in_text_enabled and rule['type'] != 'all':
is_found_on_text_downloaded_file = await self.__find_in_text_files(
entity=entity,
finder=finder,
file_content=cached_file_content,
)

if is_found_on_content or is_found_on_text_downloaded_file:

# Update found_on Flag
entity.found_on = 'MESSAGE' if is_found_on_content else f'FILE\n{entity.downloaded_media_info.disk_file_path}' # type: ignore

# Run the Notification Engine
await self.notification_engine.run(
Expand All @@ -71,3 +106,28 @@ async def run(self, entity: Optional[FinderNotificationMessageEntity], source: s
rule_id=rule['id'],
source=source,
)

async def __find_in_text_files(self, entity: FinderNotificationMessageEntity, finder: BaseFinder, file_content: str) -> bool:
    """Run the given finder against the text file downloaded with the message.

    :param entity: Message entity; its ``downloaded_media_info`` describes the file on disk.
    :param finder: Finder used to search the file content.
    :param file_content: Already loaded file content. When it is an empty string,
        the file is read from disk.
    :return: The finder result for the file content. False when there is no
        downloaded media, its content type is not searchable, the file does
        not exist, or its size exceeds ``find_in_text_files_max_size_bytes``.
    """
    media_info = entity.downloaded_media_info
    if not media_info or not media_info.allow_search_in_text_file():
        return False

    # Check if File Exists
    if not await aiofiles.os.path.exists(media_info.disk_file_path):
        return False

    # Check Max Size
    if media_info.size_bytes > self.find_in_text_files_max_size_bytes:
        return False

    # Open and Read the File
    # NOTE: rebinding ``file_content`` below is local to this call; the caller's
    # variable is not updated by it.
    if file_content == '':
        # The context manager closes the file, so no explicit close is needed.
        async with aiofiles.open(media_info.disk_file_path, 'rb') as f:
            file_bytes = await f.read()

        # Searchable content types are not guaranteed to be valid UTF-8; replace
        # undecodable bytes instead of raising UnicodeDecodeError.
        file_content = file_bytes.decode('UTF-8', errors='replace')

    # Run Finder
    return await finder.find(raw_text=file_content)
1 change: 1 addition & 0 deletions TEx/models/facade/finder_notification_facade_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ class FinderNotificationMessageEntity(BaseModel):
message_id: Optional[int]
is_reply: Optional[bool]
downloaded_media_info: Optional[MediaHandlingEntity]
found_on: str
14 changes: 14 additions & 0 deletions TEx/models/facade/media_handler_facade_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,17 @@ def is_video(self) -> bool:
"""Return if Downloaded Image are a Video."""
return self.content_type in ['application/ogg', 'video/mp4', 'video/quicktime', 'video/webm']

def allow_search_in_text_file(self) -> bool:
    """Tell whether this media's content type is one the finder may search as text."""
    searchable_content_types = (
        'application/atom+xml',
        'application/bittorrent',
        'application/csv',
        'application/html',
        'application/json',
        'application/ld+json',
        'text/csv',
        'text/html',
        'text/plain',
        'text/xml',
    )

    # Exact, case-sensitive match against the supported content types
    return self.content_type in searchable_content_types
1 change: 1 addition & 0 deletions TEx/notifier/discord_notifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ async def __get_finder_notification_embed(self, entity: FinderNotificationMessag
embed.add_embed_field(name='Group Name', value=entity.group_name if entity.group_name else '', inline=True)
embed.add_embed_field(name='Group ID', value=str(entity.group_id), inline=True)

embed.add_embed_field(name='Found On', value=entity.found_on, inline=False)
embed.add_embed_field(name='Message Date', value=str(entity.date_time), inline=False)
embed.add_embed_field(name='Tag', value=duplication_tag, inline=False)

Expand Down
1 change: 1 addition & 0 deletions TEx/notifier/elastic_search_notifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ async def __get_dict_for_finder_notification(self, entity: FinderNotificationMes
'reply_to_msg_id': entity.reply_to_msg_id,
'message_id': entity.message_id,
'is_reply': entity.is_reply,
'found_on': entity.found_on,
}

if entity.downloaded_media_info:
Expand Down
39 changes: 39 additions & 0 deletions docs/finder/configuration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Message Finder System

**Compatibility:** Message Listener Command

Telegram Explorer allows you to specify many message finders. Usually, the finder engine looks at messages, but it can also look at downloaded text files (plain, csv, xml, json, etc.).

**Configuration Spec:**

To use the finder engine, you must enable it in the configuration and choose whether the engine is also allowed to search inside downloaded files.

**Parameters:**

* **enabled** > Required - Enable(true)/Disable(false) the finder engine.
* **find_in_text_files_enabled** > Optional - Enable(true)/Disable(false) running the finder engine inside the downloaded text files.
* Default: false
* **find_in_text_files_max_size_bytes** > Optional - Maximum size, in bytes, of a file that the engine is allowed to load into memory and search.
* Default: 10000000


**Changes on Configuration File**
```ini
[FINDER]
enabled=true
find_in_text_files_enabled=true
find_in_text_files_max_size_bytes=20000000
```

**Files Supported for the Engine:**

* application/atom+xml
* application/bittorrent
* application/csv
* application/html
* application/json
* application/ld+json
* text/csv
* text/html
* text/plain
* text/xml
Loading

0 comments on commit 83de3d1

Please sign in to comment.