Skip to content

Commit

Permalink
[Issue 3497] add allowed list to filter through attachments (#3544)
Browse files Browse the repository at this point in the history
## Summary
Fixes [#3497](#3497)

### Time to review: __5 mins__

## Changes proposed
Added a filter so that only attachments whose file names end in an allowed suffix (matched case-insensitively) are included
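Updated the test to add an attachment with a disallowed suffix (`.css`) and assert that only the allowed one is uploaded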
  • Loading branch information
babebe authored Jan 17, 2025
1 parent b4ccc44 commit dca1e2c
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 25 deletions.
36 changes: 20 additions & 16 deletions api/src/search/backend/load_opportunities_to_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from enum import StrEnum
from typing import Iterator, Sequence

import smart_open
from opensearchpy.exceptions import ConnectionTimeout, TransportError
from pydantic import Field
from pydantic_settings import SettingsConfigDict
Expand All @@ -22,11 +21,16 @@
OpportunitySearchIndexQueue,
)
from src.task.task import Task
from src.util import file_util
from src.util.datetime_util import get_now_us_eastern_datetime
from src.util.env_config import PydanticBaseEnvConfig

logger = logging.getLogger(__name__)

# Lowercase file extensions (without the leading dot) accepted by
# filter_attachment; anything else is skipped.
ALLOWED_ATTACHMENT_SUFFIXES = {
    "txt",
    "pdf",
    "docx",
    "doc",
    "xlsx",
    "xlsm",
    "html",
    "htm",
    "pptx",
    "ppt",
    "rtf",
}


class LoadOpportunitiesToIndexConfig(PydanticBaseEnvConfig):
model_config = SettingsConfigDict(env_prefix="LOAD_OPP_SEARCH_")
Expand Down Expand Up @@ -275,28 +279,28 @@ def fetch_existing_opportunity_ids_in_index(self) -> set[int]:

return opportunity_ids

def filter_attachments(
self, attachments: list[OpportunityAttachment]
) -> list[OpportunityAttachment]:
return [attachment for attachment in attachments]
def filter_attachment(self, attachment: OpportunityAttachment) -> bool:
    """Return True when the attachment's file extension is on the allow-list.

    The extension is the text after the last "." in ``attachment.file_name``,
    compared case-insensitively against ``ALLOWED_ATTACHMENT_SUFFIXES``.
    A name that contains no "." has no extension and is rejected, so a file
    literally named "pdf" is not mistaken for a PDF document.
    """
    # rpartition returns an empty separator when the name has no "." at all,
    # whereas split(".")[-1] would hand back the whole name as the "suffix".
    _, dot, file_suffix = attachment.file_name.lower().rpartition(".")
    return bool(dot) and file_suffix in ALLOWED_ATTACHMENT_SUFFIXES

def get_attachment_json_for_opportunity(
self, opp_attachments: list[OpportunityAttachment]
) -> list[dict]:

attachments = []
for att in opp_attachments:
with smart_open.open(
att.file_location,
"rb",
) as file:
file_content = file.read()
attachments.append(
{
"filename": att.file_name,
"data": base64.b64encode(file_content).decode("utf-8"),
}
)
if self.filter_attachment(att):
with file_util.open_stream(
att.file_location,
"rb",
) as file:
file_content = file.read()
attachments.append(
{
"filename": att.file_name,
"data": base64.b64encode(file_content).decode("utf-8"),
}
)

return attachments

Expand Down
30 changes: 21 additions & 9 deletions api/tests/src/search/backend/test_load_opportunities_to_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,18 +149,27 @@ def test_opportunity_attachment_pipeline(
opportunity_index_alias,
search_client,
):
filename = "test_file_1.txt"
file_path = f"s3://{mock_s3_bucket}/{filename}"
filename_1 = "test_file_1.txt"
file_path_1 = f"s3://{mock_s3_bucket}/{filename_1}"
content = "I am a file"
with file_util.open_stream(file_path, "w") as outfile:

with file_util.open_stream(file_path_1, "w") as outfile:
outfile.write(content)

filename_2 = "test_file_2.css"
file_path_2 = f"s3://{mock_s3_bucket}/{filename_2}"

opportunity = OpportunityFactory.create(opportunity_attachments=[])
OpportunityAttachmentFactory.create(
mime_type="text/plain",
opportunity=opportunity,
file_location=file_path,
file_name=filename,
file_location=file_path_1,
file_name=filename_1,
)

OpportunityAttachmentFactory.create(
opportunity=opportunity,
file_location=file_path_2,
file_name=filename_2,
)

load_opportunities_to_index.index_name = (
Expand All @@ -172,11 +181,14 @@ def test_opportunity_attachment_pipeline(
resp = search_client.search(opportunity_index_alias, {"size": 100})

record = [d for d in resp.records if d.get("opportunity_id") == opportunity.opportunity_id]
attachment = record[0]["attachments"][0]
attachments = record[0]["attachments"]

# assert only one (allowed) opportunity attachment was uploaded
assert len(attachments) == 1
# assert correct attachment was uploaded
assert attachment["filename"] == filename
assert attachments[0]["filename"] == filename_1
# assert data was b64encoded
assert attachment["attachment"]["content"] == content # decoded b64encoded attachment
assert attachments[0]["attachment"]["content"] == content # decoded b64encoded attachment


class TestLoadOpportunitiesToIndexPartialRefresh(BaseTestClass):
Expand Down

0 comments on commit dca1e2c

Please sign in to comment.