From e92eba10a3c67d96c3dd3d8f54e0c9854d5ac9ac Mon Sep 17 00:00:00 2001 From: Pavan-Microsoft Date: Thu, 2 Jan 2025 16:45:39 +0530 Subject: [PATCH] fix: Add Missing Logs (#1609) --- code/backend/batch/batch_push_results.py | 5 +++- .../helpers/azure_form_recognizer_helper.py | 8 +++++ .../utilities/helpers/config/config_helper.py | 8 ++++- .../integrated_vectorization_embedder.py | 11 +++++++ .../helpers/embedders/postgres_embedder.py | 13 +++++++++ .../helpers/embedders/push_embedder.py | 13 +++++++++ .../batch/utilities/helpers/env_helper.py | 1 + .../batch/utilities/helpers/llm_helper.py | 6 ++++ .../orchestrator/lang_chain_agent.py | 2 ++ .../orchestrator/open_ai_functions.py | 5 ++++ .../utilities/orchestrator/prompt_flow.py | 20 +++++++++++++ .../utilities/orchestrator/semantic_kernel.py | 2 ++ .../utilities/parser/output_parser_tool.py | 2 ++ .../utilities/search/azure_search_handler.py | 12 ++++++++ ...integrated_vectorization_search_handler.py | 29 +++++++++++++++++-- .../utilities/tools/content_safety_checker.py | 11 +++++++ .../utilities/tools/question_answer_tool.py | 13 +++++++++ code/backend/pages/01_Ingest_Data.py | 1 + code/backend/pages/02_Explore_Data.py | 3 ++ code/backend/pages/04_Configuration.py | 3 ++ code/create_app.py | 15 ++++++++++ 21 files changed, 179 insertions(+), 4 deletions(-) diff --git a/code/backend/batch/batch_push_results.py b/code/backend/batch/batch_push_results.py index 673e6a0b8..2e6538325 100644 --- a/code/backend/batch/batch_push_results.py +++ b/code/backend/batch/batch_push_results.py @@ -28,19 +28,22 @@ def _get_file_name_from_message(message_body) -> str: ) def batch_push_results(msg: func.QueueMessage) -> None: message_body = json.loads(msg.get_body().decode("utf-8")) - logger.debug("Process Document Event queue function triggered: %s", message_body) + logger.info("Process Document Event queue function triggered: %s", message_body) event_type = message_body.get("eventType", "") # We handle "" in this scenario for backwards compatibility # This function is primarily triggered by an Event Grid queue message from the blob storage # However, it can also be triggered using a legacy schema from BatchStartProcessing if event_type in ("", "Microsoft.Storage.BlobCreated"): + logger.info("Handling 'Blob Created' event with message body: %s", message_body) _process_document_created_event(message_body) elif event_type == "Microsoft.Storage.BlobDeleted": + logger.info("Handling 'Blob Deleted' event with message body: %s", message_body) _process_document_deleted_event(message_body) else: + logger.exception("Received an unrecognized event type: %s", event_type) raise NotImplementedError(f"Unknown event type received: {event_type}") diff --git a/code/backend/batch/utilities/helpers/azure_form_recognizer_helper.py b/code/backend/batch/utilities/helpers/azure_form_recognizer_helper.py index 22e0fa576..5abb54d15 100644 --- a/code/backend/batch/utilities/helpers/azure_form_recognizer_helper.py +++ b/code/backend/batch/utilities/helpers/azure_form_recognizer_helper.py @@ -1,3 +1,4 @@ +import logging from azure.core.credentials import AzureKeyCredential from azure.ai.formrecognizer import DocumentAnalysisClient from azure.identity import DefaultAzureCredential @@ -5,6 +6,8 @@ import traceback from .env_helper import EnvHelper +logger = logging.getLogger(__name__) + class AzureFormRecognizerClient: def __init__(self) -> None: @@ -75,6 +78,8 @@ def begin_analyze_document_from_url( model_id = "prebuilt-layout" if use_layout else "prebuilt-read" try: + logger.info("Method begin_analyze_document_from_url started") + logger.info(f"Model ID selected: {model_id}") poller = self.document_analysis_client.begin_analyze_document_from_url( model_id, document_url=source_url ) @@ -144,4 +149,7 @@ def begin_analyze_document_from_url( return page_map except Exception as e: + logger.exception(f"Exception in begin_analyze_document_from_url: {e}") raise ValueError(f"Error: {traceback.format_exc()}. Error: {e}") + finally: + logger.info("Method begin_analyze_document_from_url ended") diff --git a/code/backend/batch/utilities/helpers/config/config_helper.py b/code/backend/batch/utilities/helpers/config/config_helper.py index 93d86b80c..bc16287ce 100644 --- a/code/backend/batch/utilities/helpers/config/config_helper.py +++ b/code/backend/batch/utilities/helpers/config/config_helper.py @@ -190,21 +190,27 @@ def _set_new_config_properties(config: dict, default_config: dict): @staticmethod @functools.cache def get_active_config_or_default(): + logger.info("Method get_active_config_or_default started") env_helper = EnvHelper() config = ConfigHelper.get_default_config() if env_helper.LOAD_CONFIG_FROM_BLOB_STORAGE: + logger.info("Loading configuration from Blob Storage") blob_client = AzureBlobStorageClient(container_name=CONFIG_CONTAINER_NAME) if blob_client.file_exists(CONFIG_FILE_NAME): + logger.info("Configuration file found in Blob Storage") default_config = config config_file = blob_client.download_file(CONFIG_FILE_NAME) config = json.loads(config_file) ConfigHelper._set_new_config_properties(config, default_config) else: - logger.info("Returning default config") + logger.info( + "Configuration file not found in Blob Storage, using default configuration" + ) + logger.info("Method get_active_config_or_default ended") return Config(config) @staticmethod diff --git a/code/backend/batch/utilities/helpers/embedders/integrated_vectorization_embedder.py b/code/backend/batch/utilities/helpers/embedders/integrated_vectorization_embedder.py index 0e74a83e8..07f33a573 100644 --- a/code/backend/batch/utilities/helpers/embedders/integrated_vectorization_embedder.py +++ b/code/backend/batch/utilities/helpers/embedders/integrated_vectorization_embedder.py @@ -15,11 +15,16 @@ class IntegratedVectorizationEmbedder(EmbedderBase): def __init__(self, env_helper: EnvHelper): self.env_helper = env_helper self.llm_helper: LLMHelper = LLMHelper() + logger.info("Initialized IntegratedVectorizationEmbedder.") def embed_file(self, source_url: str, file_name: str = None): + logger.info( + f"Starting embed_file for source_url: {source_url}, file_name: {file_name}." + ) self.process_using_integrated_vectorization(source_url=source_url) def process_using_integrated_vectorization(self, source_url: str): + logger.info(f"Starting integrated vectorization for source_url: {source_url}.") config = ConfigHelper.get_active_config_or_default() try: search_datasource = AzureSearchDatasource(self.env_helper) @@ -35,14 +40,20 @@ def process_using_integrated_vectorization(self, source_url: str): self.env_helper.AZURE_SEARCH_INDEXER_NAME, skillset_name=search_skillset_result.name, ) + logger.info("Integrated vectorization process completed successfully.") return indexer_result except Exception as e: logger.error(f"Error processing {source_url}: {e}") raise e def reprocess_all(self): + logger.info("Starting reprocess_all operation.") search_indexer = AzureSearchIndexer(self.env_helper) if search_indexer.indexer_exists(self.env_helper.AZURE_SEARCH_INDEXER_NAME): + logger.info( + f"Running indexer: {self.env_helper.AZURE_SEARCH_INDEXER_NAME}." + ) search_indexer.run_indexer(self.env_helper.AZURE_SEARCH_INDEXER_NAME) else: + logger.info("Indexer does not exist. Starting full processing.") self.process_using_integrated_vectorization(source_url="all") diff --git a/code/backend/batch/utilities/helpers/embedders/postgres_embedder.py b/code/backend/batch/utilities/helpers/embedders/postgres_embedder.py index d81c9727c..5041485c0 100644 --- a/code/backend/batch/utilities/helpers/embedders/postgres_embedder.py +++ b/code/backend/batch/utilities/helpers/embedders/postgres_embedder.py @@ -20,6 +20,7 @@ class PostgresEmbedder(EmbedderBase): def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper): + logger.info("Initializing PostgresEmbedder.") self.env_helper = env_helper self.llm_helper = LLMHelper() self.azure_postgres_helper = AzurePostgresHelper() @@ -33,6 +34,7 @@ def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper): self.embedding_configs[ext] = processor def embed_file(self, source_url: str, file_name: str): + logger.info(f"Embedding file: {file_name} from source: {source_url}") file_extension = file_name.split(".")[-1].lower() embedding_config = self.embedding_configs.get(file_extension) self.__embed( @@ -48,32 +50,42 @@ def embed_file(self, source_url: str, file_name: str): def __embed( self, source_url: str, file_extension: str, embedding_config: EmbeddingConfig ): + logger.info(f"Starting embedding process for source: {source_url}") documents_to_upload: List[SourceDocument] = [] if ( embedding_config.use_advanced_image_processing and file_extension in self.config.get_advanced_image_processing_image_types() ): + logger.error( + "Advanced image processing is not supported in PostgresEmbedder." + ) raise NotImplementedError( "Advanced image processing is not supported in PostgresEmbedder." ) else: + logger.info(f"Loading documents from source: {source_url}") documents: List[SourceDocument] = self.document_loading.load( source_url, embedding_config.loading ) documents = self.document_chunking.chunk( documents, embedding_config.chunking ) + logger.info("Chunked into document chunks.") for document in documents: documents_to_upload.append(self.__convert_to_search_document(document)) if documents_to_upload: + logger.info( + f"Uploading {len(documents_to_upload)} documents to vector store." + ) self.azure_postgres_helper.create_vector_store(documents_to_upload) else: logger.warning("No documents to upload.") def __convert_to_search_document(self, document: SourceDocument): + logger.info(f"Generating embeddings for document ID: {document.id}") embedded_content = self.llm_helper.generate_embeddings(document.content) metadata = { "id": document.id, @@ -84,6 +96,7 @@ def __convert_to_search_document(self, document: SourceDocument): "offset": document.offset, "page_number": document.page_number, } + logger.info(f"Metadata generated for document ID: {document.id}") return { "id": document.id, "content": document.content, diff --git a/code/backend/batch/utilities/helpers/embedders/push_embedder.py b/code/backend/batch/utilities/helpers/embedders/push_embedder.py index a1cff59cc..460f4b41d 100644 --- a/code/backend/batch/utilities/helpers/embedders/push_embedder.py +++ b/code/backend/batch/utilities/helpers/embedders/push_embedder.py @@ -24,6 +24,7 @@ class PushEmbedder(EmbedderBase): def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper): + logger.info("Initializing PushEmbedder") self.env_helper = env_helper self.llm_helper = LLMHelper() self.azure_search_helper = AzureSearchHelper() @@ -33,11 +34,14 @@ def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper): self.blob_client = blob_client self.config = ConfigHelper.get_active_config_or_default() self.embedding_configs = {} + logger.info("Loading document processors") for processor in self.config.document_processors: ext = processor.document_type.lower() self.embedding_configs[ext] = processor + logger.info("Document processors loaded") def embed_file(self, source_url: str, file_name: str): + logger.info(f"Embedding file: {file_name} from URL: {source_url}") file_extension = file_name.split(".")[-1].lower() embedding_config = self.embedding_configs.get(file_extension) self.__embed( @@ -46,6 +50,7 @@ def embed_file(self, source_url: str, file_name: str): embedding_config=embedding_config, ) if file_extension != "url": + logger.info(f"Upserting blob metadata for file: {file_name}") self.blob_client.upsert_blob_metadata( file_name, {"embeddings_added": "true"} ) @@ -53,12 +58,14 @@ def embed_file(self, source_url: str, file_name: str): def __embed( self, source_url: str, file_extension: str, embedding_config: EmbeddingConfig ): + logger.info(f"Processing embedding for file extension: {file_extension}") documents_to_upload: List[SourceDocument] = [] if ( embedding_config.use_advanced_image_processing and file_extension in self.config.get_advanced_image_processing_image_types() ): + logger.info(f"Using advanced image processing for: {source_url}") caption = self.__generate_image_caption(source_url) caption_vector = self.llm_helper.generate_embeddings(caption) @@ -69,6 +76,7 @@ def __embed( ) ) else: + logger.info(f"Loading documents from source: {source_url}") documents: List[SourceDocument] = self.document_loading.load( source_url, embedding_config.loading ) @@ -81,6 +89,7 @@ def __embed( # Upload documents (which are chunks) to search index in batches if documents_to_upload: + logger.info("Uploading documents in batches") batch_size = self.env_helper.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE search_client = self.azure_search_helper.get_search_client() for i in range(0, len(documents_to_upload), batch_size): @@ -93,6 +102,7 @@ def __embed( logger.warning("No documents to upload.") def __generate_image_caption(self, source_url): + logger.info(f"Generating image caption for URL: {source_url}") model = self.env_helper.AZURE_OPENAI_VISION_MODEL caption_system_message = """You are an assistant that generates rich descriptions of images. You need to be accurate in the information you extract and detailed in the descriptons you generate. @@ -116,9 +126,11 @@ def __generate_image_caption(self, source_url): response = self.llm_helper.get_chat_completion(messages, model) caption = response.choices[0].message.content + logger.info("Caption generation completed") return caption def __convert_to_search_document(self, document: SourceDocument): + logger.info(f"Converting document ID {document.id} to search document format") embedded_content = self.llm_helper.generate_embeddings(document.content) metadata = { self.env_helper.AZURE_SEARCH_FIELDS_ID: document.id, @@ -151,6 +163,7 @@ def __create_image_document( content: str, content_vector: List[float], ): + logger.info(f"Creating image document for source URL: {source_url}") parsed_url = urlparse(source_url) file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path diff --git a/code/backend/batch/utilities/helpers/env_helper.py b/code/backend/batch/utilities/helpers/env_helper.py index 93a796488..3d3aaff3c 100644 --- a/code/backend/batch/utilities/helpers/env_helper.py +++ b/code/backend/batch/utilities/helpers/env_helper.py @@ -362,6 +362,7 @@ def __load_config(self, **kwargs) -> None: self.SEMENTIC_KERNEL_SYSTEM_PROMPT = os.getenv( "SEMENTIC_KERNEL_SYSTEM_PROMPT", "" ) + logger.info("Initializing EnvHelper completed") def is_chat_model(self): if "gpt-4" in self.AZURE_OPENAI_MODEL_NAME.lower(): diff --git a/code/backend/batch/utilities/helpers/llm_helper.py b/code/backend/batch/utilities/helpers/llm_helper.py index 7dfc58002..7517fb575 100644 --- a/code/backend/batch/utilities/helpers/llm_helper.py +++ b/code/backend/batch/utilities/helpers/llm_helper.py @@ -1,3 +1,4 @@ +import logging from openai import AzureOpenAI from typing import List, Union, cast from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings @@ -10,9 +11,12 @@ from azure.identity import DefaultAzureCredential from .env_helper import EnvHelper +logger = logging.getLogger(__name__) + class LLMHelper: def __init__(self): + logger.info("Initializing LLMHelper") self.env_helper: EnvHelper = EnvHelper() self.auth_type_keys = self.env_helper.is_auth_type_keys() self.token_provider = self.env_helper.AZURE_TOKEN_PROVIDER @@ -38,6 +42,8 @@ def __init__(self): ) self.embedding_model = self.env_helper.AZURE_OPENAI_EMBEDDING_MODEL + logger.info("Initializing LLMHelper completed") + def get_llm(self): if self.auth_type_keys: return AzureChatOpenAI( diff --git a/code/backend/batch/utilities/orchestrator/lang_chain_agent.py b/code/backend/batch/utilities/orchestrator/lang_chain_agent.py index e7a04af5b..358dc0495 100644 --- a/code/backend/batch/utilities/orchestrator/lang_chain_agent.py +++ b/code/backend/batch/utilities/orchestrator/lang_chain_agent.py @@ -56,6 +56,7 @@ async def orchestrate( self, user_message: str, chat_history: List[dict], **kwargs: dict ) -> list[dict]: + logger.info("Method orchestrate of lang_chain_agent started") # Call Content Safety tool if self.config.prompts.enable_content_safety: if response := self.call_content_safety_input(user_message): @@ -122,4 +123,5 @@ async def orchestrate( answer=answer.answer, source_documents=answer.source_documents, ) + logger.info("Method orchestrate of lang_chain_agent ended") return messages diff --git a/code/backend/batch/utilities/orchestrator/open_ai_functions.py b/code/backend/batch/utilities/orchestrator/open_ai_functions.py index b680c2ed3..59140a851 100644 --- a/code/backend/batch/utilities/orchestrator/open_ai_functions.py +++ b/code/backend/batch/utilities/orchestrator/open_ai_functions.py @@ -54,9 +54,12 @@ def __init__(self) -> None: async def orchestrate( self, user_message: str, chat_history: List[dict], **kwargs: dict ) -> list[dict]: + logger.info("Method orchestrate of open_ai_functions started") # Call Content Safety tool if self.config.prompts.enable_content_safety: + logger.info("Content Safety enabled. Checking input message...") if response := self.call_content_safety_input(user_message): + logger.info("Content Safety check returned a response. Exiting method.") return response # Call function to determine route @@ -143,6 +146,7 @@ async def orchestrate( answer = Answer(question=user_message, answer=text) if answer.answer is None: + logger.info("Answer is None") answer.answer = "The requested information is not available in the retrieved data. Please try another query or topic." # Call Content Safety tool @@ -156,4 +160,5 @@ async def orchestrate( answer=answer.answer, source_documents=answer.source_documents, ) + logger.info("Method orchestrate of open_ai_functions ended") return messages diff --git a/code/backend/batch/utilities/orchestrator/prompt_flow.py b/code/backend/batch/utilities/orchestrator/prompt_flow.py index 4f6cb85d2..e46b38f71 100644 --- a/code/backend/batch/utilities/orchestrator/prompt_flow.py +++ b/code/backend/batch/utilities/orchestrator/prompt_flow.py @@ -23,12 +23,17 @@ def __init__(self) -> None: self.enpoint_name = self.env_helper.PROMPT_FLOW_ENDPOINT_NAME self.deployment_name = self.env_helper.PROMPT_FLOW_DEPLOYMENT_NAME + logger.info("PromptFlowOrchestrator initialized.") + async def orchestrate( self, user_message: str, chat_history: List[dict], **kwargs: dict ) -> list[dict]: + logger.info("Orchestration started.") # Call Content Safety tool on question if self.config.prompts.enable_content_safety: + logger.info("Content safety check enabled for input.") if response := self.call_content_safety_input(user_message): + logger.info("Content safety flagged the input. Returning response.") return response transformed_chat_history = self.transform_chat_history(chat_history) @@ -36,14 +41,17 @@ async def orchestrate( file_name = self.transform_data_into_file( user_message, transformed_chat_history ) + logger.info(f"File created for Prompt Flow: {file_name}") # Call the Prompt Flow service try: + logger.info("Invoking Prompt Flow service.") response = self.ml_client.online_endpoints.invoke( endpoint_name=self.enpoint_name, request_file=file_name, deployment_name=self.deployment_name, ) + logger.info("Prompt Flow service invoked successfully.") result = json.loads(response) logger.debug(result) except Exception as error: @@ -51,6 +59,7 @@ async def orchestrate( raise RuntimeError(f"The request failed: {error}") from error # Transform response into answer for further processing + logger.info("Processing response from Prompt Flow.") answer = Answer( question=user_message, answer=result["chat_output"], @@ -58,21 +67,27 @@ async def orchestrate( result["citations"] ), ) + logger.info("Answer processed successfully.") # Call Content Safety tool on answer if self.config.prompts.enable_content_safety: + logger.info("Content safety check enabled for output.") if response := self.call_content_safety_output(user_message, answer.answer): + logger.info("Content safety flagged the output. Returning response.") return response # Format the output for the UI + logger.info("Formatting output for UI.") messages = self.output_parser.parse( question=answer.question, answer=answer.answer, source_documents=answer.source_documents, ) + logger.info("Orchestration completed successfully.") return messages def transform_chat_history(self, chat_history): + logger.info("Transforming chat history.") transformed_chat_history = [] for i, message in enumerate(chat_history): if message["role"] == "user": @@ -89,17 +104,21 @@ def transform_chat_history(self, chat_history): "outputs": {"chat_output": assistant_message}, } ) + logger.info("Chat history transformation completed.") return transformed_chat_history def transform_data_into_file(self, user_message, chat_history): # Transform data input into a file for the Prompt Flow service + logger.info("Creating temporary file for Prompt Flow input.") data = {"chat_input": user_message, "chat_history": chat_history} body = str.encode(json.dumps(data)) with tempfile.NamedTemporaryFile(delete=False) as file: file.write(body) + logger.info("Temporary file created") return file.name def transform_citations_into_source_documents(self, citations): + logger.info("Transforming citations into source documents.") source_documents = [] for _, doc_id in enumerate(citations): @@ -112,4 +131,5 @@ def transform_citations_into_source_documents(self, citations): chunk_id=str(citation.get("chunk_id", 0)), ) ) + logger.info("Citations transformation completed.") return source_documents diff --git a/code/backend/batch/utilities/orchestrator/semantic_kernel.py b/code/backend/batch/utilities/orchestrator/semantic_kernel.py index 0937a80b0..8cc743c0d 100644 --- a/code/backend/batch/utilities/orchestrator/semantic_kernel.py +++ b/code/backend/batch/utilities/orchestrator/semantic_kernel.py @@ -35,6 +35,7 @@ def __init__(self) -> None: async def orchestrate( self, user_message: str, chat_history: list[dict], **kwargs: dict ) -> list[dict]: + logger.info("Method orchestrate of semantic_kernel started") # Call Content Safety tool if self.config.prompts.enable_content_safety: if response := self.call_content_safety_input(user_message): @@ -143,4 +144,5 @@ async def orchestrate( answer=answer.answer, source_documents=answer.source_documents, ) + logger.info("Method orchestrate of semantic_kernel ended") return messages diff --git a/code/backend/batch/utilities/parser/output_parser_tool.py b/code/backend/batch/utilities/parser/output_parser_tool.py index ac326353f..ace176a83 100644 --- a/code/backend/batch/utilities/parser/output_parser_tool.py +++ b/code/backend/batch/utilities/parser/output_parser_tool.py @@ -34,6 +34,7 @@ def parse( source_documents: List[SourceDocument] = [], **kwargs: dict, ) -> List[dict]: + logger.info("Method parse of output_parser_tool started") answer = self._clean_up_answer(answer) doc_ids = self._get_source_docs_from_answer(answer) answer = self._make_doc_references_sequential(answer) @@ -87,4 +88,5 @@ def parse( messages.append({"role": "assistant", "content": answer, "end_turn": True}) # everything in content needs to be stringified to work with Azure BYOD frontend messages[0]["content"] = json.dumps(messages[0]["content"]) + logger.info("Method parse of output_parser_tool ended") return messages diff --git a/code/backend/batch/utilities/search/azure_search_handler.py b/code/backend/batch/utilities/search/azure_search_handler.py index 42e6e73c7..c7d385598 100644 --- a/code/backend/batch/utilities/search/azure_search_handler.py +++ b/code/backend/batch/utilities/search/azure_search_handler.py @@ -1,3 +1,4 @@ +import logging from typing import List from .search_handler_base import SearchHandlerBase @@ -9,6 +10,8 @@ from azure.search.documents.models import VectorizedQuery import tiktoken +logger = logging.getLogger(__name__) + class AzureSearchHandler(SearchHandlerBase): _ENCODER_NAME = "cl100k_base" @@ -27,13 +30,16 @@ def perform_search(self, filename): ) def process_results(self, results): + logger.info("Processing search results") if results is None: + logger.warning("No results found") return [] data = [ # Note that images uploaded with advanced image processing do not have a chunk ID [json.loads(result["metadata"]).get("chunk", i), result["content"]] for i, result in enumerate(results) ] + logger.info("Processed results") return data def get_files(self): @@ -73,25 +79,31 @@ def search_by_blob_url(self, blob_url): ) def query_search(self, question) -> List[SourceDocument]: + logger.info(f"Performing query search for question: {question}") encoding = tiktoken.get_encoding(self._ENCODER_NAME) tokenised_question = encoding.encode(question) if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING: + logger.info("Using advanced image processing for vectorization") vectorized_question = self.azure_computer_vision_client.vectorize_text( question ) else: + logger.info("Skipping advanced image processing") vectorized_question = None if self.env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH: + logger.info("Performing semantic search") results = self._semantic_search( question, tokenised_question, vectorized_question ) else: + logger.info("Performing hybrid search") results = self._hybrid_search( question, tokenised_question, vectorized_question ) + logger.info("Converting search results to SourceDocument list") return self._convert_to_source_documents(results) def _semantic_search( diff --git a/code/backend/batch/utilities/search/integrated_vectorization_search_handler.py b/code/backend/batch/utilities/search/integrated_vectorization_search_handler.py index 4f6dbb9b3..d9470a6a0 100644 --- a/code/backend/batch/utilities/search/integrated_vectorization_search_handler.py +++ b/code/backend/batch/utilities/search/integrated_vectorization_search_handler.py @@ -1,3 +1,4 @@ +import logging from typing import List from .search_handler_base import SearchHandlerBase from azure.search.documents import SearchClient @@ -11,7 +12,9 @@ class IntegratedVectorizationSearchHandler(SearchHandlerBase): def create_search_client(self): + logging.info("Creating Azure Search Client.") if self._check_index_exists(): + logging.info("Search index exists. Returning Search Client.") return SearchClient( endpoint=self.env_helper.AZURE_SEARCH_SERVICE, index_name=self.env_helper.AZURE_SEARCH_INDEX, @@ -23,6 +26,7 @@ def create_search_client(self): ) def perform_search(self, filename): + logging.info(f"Performing search for file: {filename}.") if self._check_index_exists(): return self.search_client.search( search_text="*", @@ -31,21 +35,26 @@ def perform_search(self, filename): ) def process_results(self, results): + logging.info("Processing search results.") if results is None: + logging.warning("No results found to process.") return [] data = [ [re.findall(r"\d+", result["chunk_id"])[-1], result["content"]] for result in results ] + logging.info(f"Processed {len(data)} results.") return data def get_files(self): + logging.info("Fetching files from search index.") if self._check_index_exists(): return self.search_client.search( "*", select="id, chunk_id, title", include_total_count=True ) def output_results(self, results): + logging.info("Organizing search results into output format.") files = {} for result in results: id = result["chunk_id"] @@ -54,10 +63,10 @@ def output_results(self, results): files[filename].append(id) else: files[filename] = [id] - return files def search_by_blob_url(self, blob_url: str): + logging.info(f"Searching by blob URL: {blob_url}.") if self._check_index_exists(): title = blob_url.split(f"{self.env_helper.AZURE_BLOB_CONTAINER_NAME}/")[1] return self.search_client.search( @@ -68,6 +77,7 @@ def search_by_blob_url(self, blob_url: str): ) def delete_files(self, files): + logging.info("Deleting files.") ids_to_delete = [] files_to_delete = [] @@ -77,17 +87,24 @@ def delete_files(self, files): self.search_client.delete_documents(ids_to_delete) + logging.info(f"Deleted files: {', '.join(files_to_delete)}.") return ", ".join(files_to_delete) def query_search(self, question) -> List[SourceDocument]: + logging.info(f"Querying search for question: {question}.") if self._check_index_exists(): + logging.info("Search index exists. Proceeding with search.") if self.env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH: + logging.info("Using semantic search.") search_results = self._semantic_search(question) else: + logging.info("Using hybrid search.") search_results = self._hybrid_search(question) + logging.info("Search completed. Converting results to SourceDocuments.") return self._convert_to_source_documents(search_results) def _hybrid_search(self, question: str): + logging.info(f"Performing hybrid search for question: {question}.") vector_query = VectorizableTextQuery( text=question, k_nearest_neighbors=self.env_helper.AZURE_SEARCH_TOP_K, @@ -101,6 +118,7 @@ def _hybrid_search(self, question: str): ) def _semantic_search(self, question: str): + logging.info(f"Performing semantic search for question: {question}.") vector_query = VectorizableTextQuery( text=question, k_nearest_neighbors=self.env_helper.AZURE_SEARCH_TOP_K, @@ -119,6 +137,7 @@ def _semantic_search(self, question: str): ) def _convert_to_source_documents(self, search_results) -> List[SourceDocument]: + logging.info("Converting search results to SourceDocument objects.") source_documents = [] for source in search_results: source_documents.append( @@ -130,18 +149,22 @@ def _convert_to_source_documents(self, search_results) -> List[SourceDocument]: chunk_id=source.get("chunk_id"), ) ) + logging.info("Converted SourceDocument objects.") return source_documents def _extract_source_url(self, original_source: str) -> str: + logging.info("Extracting source URL.") matches = list(re.finditer(r"https?://", original_source)) if len(matches) > 1: second_http_start = matches[1].start() source_url = original_source[second_http_start:] else: source_url = original_source + "_SAS_TOKEN_PLACEHOLDER_" + logging.info(f"Extracted source URL: {source_url}.") return source_url def _check_index_exists(self) -> bool: + logging.info("Checking if search index exists.") search_index_client = SearchIndexClient( endpoint=self.env_helper.AZURE_SEARCH_SERVICE, credential=( @@ -151,6 +174,8 @@ def _check_index_exists(self) -> bool: ), ) - return self.env_helper.AZURE_SEARCH_INDEX in [ + exists = self.env_helper.AZURE_SEARCH_INDEX in [ name for name in search_index_client.list_index_names() ] + logging.info(f"Search index exists: {exists}.") + return exists diff --git a/code/backend/batch/utilities/tools/content_safety_checker.py b/code/backend/batch/utilities/tools/content_safety_checker.py index d04c77f23..efba3a4c4 100644 --- a/code/backend/batch/utilities/tools/content_safety_checker.py +++ b/code/backend/batch/utilities/tools/content_safety_checker.py @@ -16,22 +16,28 @@ def __init__(self): env_helper = EnvHelper() if env_helper.AZURE_AUTH_TYPE == "rbac": + logger.info("Initializing ContentSafetyClient with RBAC authentication.") self.content_safety_client = ContentSafetyClient( env_helper.AZURE_CONTENT_SAFETY_ENDPOINT, DefaultAzureCredential(), ) else: + logger.info( + "Initializing ContentSafetyClient with AzureKeyCredential authentication." + ) self.content_safety_client = ContentSafetyClient( env_helper.AZURE_CONTENT_SAFETY_ENDPOINT, AzureKeyCredential(env_helper.AZURE_CONTENT_SAFETY_KEY), ) def process_answer(self, answer: Answer, **kwargs: dict) -> Answer: + logger.info("Processing answer.") response_template = kwargs["response_template"] answer.answer = self._filter_text_and_replace(answer.answer, response_template) return answer def validate_input_and_replace_if_harmful(self, text): + logger.info("Validating input text for harmful content") response_template = f'{"Unfortunately, I am not able to process your question, as I have detected sensitive content that I am not allowed to process. This might be a mistake, so please try rephrasing your question."}' return self.process_answer( Answer(question="", answer=text, source_documents=[]), @@ -39,6 +45,7 @@ def validate_input_and_replace_if_harmful(self, text): ).answer def validate_output_and_replace_if_harmful(self, text): + logger.info("Validating output text for harmful content") response_template = f'{"Unfortunately, I have detected sensitive content in my answer, which I am not allowed to show you. This might be a mistake, so please try again and maybe rephrase your question."}' return self.process_answer( Answer(question="", answer=text, source_documents=[]), @@ -46,6 +53,7 @@ def validate_output_and_replace_if_harmful(self, text): ).answer def _filter_text_and_replace(self, text, response_template): + logger.info("Analyzing text for harmful content") request = AnalyzeTextOptions(text=text) try: response = self.content_safety_client.analyze_text(request) @@ -64,6 +72,9 @@ def _filter_text_and_replace(self, text, response_template): # filtered_text = response_template for result in response.categories_analysis: if result.severity > 0: + logger.warning( + f"Harmful content detected: Severity: {result.severity}. Replacing text." + ) filtered_text = response_template return filtered_text diff --git a/code/backend/batch/utilities/tools/question_answer_tool.py b/code/backend/batch/utilities/tools/question_answer_tool.py index 6c944d943..fb6d09791 100644 --- a/code/backend/batch/utilities/tools/question_answer_tool.py +++ b/code/backend/batch/utilities/tools/question_answer_tool.py @@ -24,6 +24,7 @@ def __init__(self) -> None: self.verbose = True self.config = ConfigHelper.get_active_config_or_default() + logger.info("QuestionAnswerTool initialized with configuration.") @staticmethod def json_remove_whitespace(obj: str) -> str: @@ -33,6 +34,7 @@ def json_remove_whitespace(obj: str) -> str: try: return json.dumps(json.loads(obj), separators=(",", ":")) except json.JSONDecodeError: + logger.exception("Failed to parse JSON in json_remove_whitespace.") return obj @staticmethod @@ -50,6 +52,9 @@ def generate_messages(self, question: str, sources: list[SourceDocument]): [f"[doc{i+1}]: {source.content}" for i, source in enumerate(sources)] ) + logger.info( + f"Generating messages for question: {question} with {len(sources)} sources." + ) return [ { "content": self.config.prompts.answering_user_prompt.format( @@ -68,6 +73,7 @@ def generate_on_your_data_messages( ) -> list[dict]: examples = [] + logger.info(f"Generating On Your Data messages for question: {question}") few_shot_example = { "sources": self.config.example.documents.strip(), "question": self.config.example.user_question.strip(), @@ -148,10 +154,14 @@ def generate_on_your_data_messages( ] def answer_question(self, question: str, chat_history: list[dict], **kwargs): + logger.info("Answering question") source_documents = Search.get_source_documents(self.search_handler, question) if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING: image_urls = self.create_image_url_list(source_documents) + logger.info( + f"Generated {len(image_urls)} image URLs for advanced image processing." + ) else: image_urls = [] @@ -188,6 +198,9 @@ def create_image_url_list(self, source_documents): if doc.title is not None and doc.title.split(".")[-1] in image_types ][: self.env_helper.ADVANCED_IMAGE_PROCESSING_MAX_IMAGES] + logger.info( + f"Generated {len(image_urls)} image URLs for {len(source_documents)} source documents." + ) return image_urls def format_answer_from_response( diff --git a/code/backend/pages/01_Ingest_Data.py b/code/backend/pages/01_Ingest_Data.py index 8f572a719..f5fd97b44 100644 --- a/code/backend/pages/01_Ingest_Data.py +++ b/code/backend/pages/01_Ingest_Data.py @@ -49,6 +49,7 @@ def reprocess_all(): else: st.error(f"Error: {response.text}") except Exception: + logger.error(traceback.format_exc()) st.error(traceback.format_exc()) diff --git a/code/backend/pages/02_Explore_Data.py b/code/backend/pages/02_Explore_Data.py index 0d71ed47b..6dc0a9efd 100644 --- a/code/backend/pages/02_Explore_Data.py +++ b/code/backend/pages/02_Explore_Data.py @@ -1,3 +1,4 @@ +import logging import streamlit as st import os import traceback @@ -9,6 +10,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..")) env_helper: EnvHelper = EnvHelper() +logger = logging.getLogger(__name__) st.set_page_config( page_title="Explore Data", @@ -62,4 +64,5 @@ def load_css(file_path): except Exception: + logger.error(traceback.format_exc()) st.error(traceback.format_exc()) diff --git a/code/backend/pages/04_Configuration.py b/code/backend/pages/04_Configuration.py index c41d17aa5..6e2c9c2bb 100644 --- a/code/backend/pages/04_Configuration.py +++ b/code/backend/pages/04_Configuration.py @@ -1,3 +1,4 @@ +import logging import os import sys import json @@ -12,6 +13,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..")) env_helper: EnvHelper = EnvHelper() +logger = logging.getLogger(__name__) st.set_page_config( page_title="Configure Prompts", @@ -517,4 +519,5 @@ def validate_documents(): del st.session_state["reset_configuration"] except Exception as e: + logger.error(f"Error occurred: {e}") st.error(e) diff --git a/code/create_app.py b/code/create_app.py index 8f81cb27e..c272387f7 100644 --- a/code/create_app.py +++ b/code/create_app.py @@ -43,6 +43,7 @@ def get_markdown_url(source, title, container_sas): def get_citations(citation_list): """Returns Formated Citations.""" + logger.info("Method get_citations started") blob_client = AzureBlobStorageClient() container_sas = blob_client.get_container_sas() citations_dict = {"citations": []} @@ -68,6 +69,7 @@ def get_citations(citation_list): "url": url, } ) + logger.info("Method get_citations ended") return citations_dict @@ -139,13 +141,16 @@ def stream_with_data(response: Stream[ChatCompletionChunk]): def conversation_with_data(conversation: Request, env_helper: EnvHelper): """This function streams the response from Azure OpenAI with data.""" + logger.info("Method conversation_with_data started") if env_helper.is_auth_type_keys(): + logger.info("Using key-based authentication for Azure OpenAI") openai_client = AzureOpenAI( azure_endpoint=env_helper.AZURE_OPENAI_ENDPOINT, api_version=env_helper.AZURE_OPENAI_API_VERSION, api_key=env_helper.AZURE_OPENAI_API_KEY, ) else: + logger.info("Using RBAC authentication for Azure OpenAI") openai_client = AzureOpenAI( azure_endpoint=env_helper.AZURE_OPENAI_ENDPOINT, api_version=env_helper.AZURE_OPENAI_API_VERSION, @@ -265,6 +270,7 @@ def conversation_with_data(conversation: Request, env_helper: EnvHelper): return response_obj + logger.info("Method conversation_with_data ended") return Response(stream_with_data(response), mimetype="application/json-lines") @@ -409,6 +415,7 @@ def health(): return "OK" def conversation_azure_byod(): + logger.info("Method conversation_azure_byod started") try: if should_use_data(env_helper, azure_search_helper): return conversation_with_data(request, env_helper) @@ -427,11 +434,14 @@ def conversation_azure_byod(): error_message = str(e) logger.exception("Exception in /api/conversation | %s", error_message) return jsonify({"error": ERROR_GENERIC_MESSAGE}), 500 + finally: + logger.info("Method conversation_azure_byod ended") async def conversation_custom(): message_orchestrator = get_message_orchestrator() try: + logger.info("Method conversation_custom started") user_message = request.json["messages"][-1]["content"] conversation_id = request.json["conversation_id"] user_assistant_messages = list( @@ -471,6 +481,8 @@ async def conversation_custom(): error_message = str(e) logger.exception("Exception in /api/conversation | %s", error_message) return jsonify({"error": ERROR_GENERIC_MESSAGE}), 500 + finally: + logger.info("Method conversation_custom ended") @app.route("/api/conversation", methods=["POST"]) async def conversation(): @@ -495,6 +507,7 @@ async def conversation(): def speech_config(): """Get the speech config for Azure Speech.""" try: + logger.info("Method speech_config started") speech_key = env_helper.AZURE_SPEECH_KEY or get_speech_key(env_helper) response = requests.post( @@ -519,6 +532,8 @@ def speech_config(): logger.exception("Exception in /api/speech | %s", str(e)) return {"error": "Failed to get speech config"}, 500 + finally: + logger.info("Method speech_config ended") @app.route("/api/assistanttype", methods=["GET"]) def assistanttype():