diff --git a/.gitignore b/.gitignore index 7e86cf0..5a008fa 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,4 @@ dmypy.json # Cython debug symbols cython_debug/ +.idea/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/git_toolbox_prj.xml b/.idea/git_toolbox_prj.xml new file mode 100644 index 0000000..b382006 --- /dev/null +++ b/.idea/git_toolbox_prj.xml @@ -0,0 +1,15 @@ + + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index 9d9d7f9..57e018c 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ *The application is under migrating to the Flask framework. Not all functionalities are available now.* -# Welcome to the SearchMyData-2.0 App! The web version of [the SearchMyData App](https://github.com/AMProduction/SearchMyData) +# Welcome to the SearchMyData-2.0 App! The web version of [the SearchMyData App](https://github.com/AMProduction/SearchMyData/wiki) ## Description The app gives the possibility to perform a search into the Ukrainian government [Open data portal](https://data.gov.ua/en/) datasets. -At this moment (October 2021) integrated [Information on missing citizens](https://data.gov.ua/en/dataset/470196d3-4e7a-46b0-8c0c-883b74ac65f0), [Information about people hiding from the authorities](https://data.gov.ua/en/dataset/7c51c4a0-104b-4540-a166-e9fc58485c1b), [Unified register of debtors](https://data.gov.ua/dataset/506734bf-2480-448c-a2b4-90b6d06df11e) and [Unified State Register of Legal Entities, Individual Entrepreneurs and Public Associations](https://data.gov.ua/dataset/1c7f3815-3259-45e0-bdf1-64dca07ddc10). +At this moment (November 2021) integrated [Information on missing citizens](https://data.gov.ua/en/dataset/470196d3-4e7a-46b0-8c0c-883b74ac65f0), +[Information about people hiding from the authorities](https://data.gov.ua/en/dataset/7c51c4a0-104b-4540-a166-e9fc58485c1b), +[Unified register of debtors](https://data.gov.ua/dataset/506734bf-2480-448c-a2b4-90b6d06df11e), +[Unified State Register of Legal Entities, Individual Entrepreneurs and Public Associations](https://data.gov.ua/dataset/1c7f3815-3259-45e0-bdf1-64dca07ddc10) +and [Integrated Unified State Register of Lustrated Persons](https://data.gov.ua/dataset/8faa71c1-3a54-45e8-8f6e-06c92b1ff8bc). ## Hardware requirements * 16Gb+ RAM * SSD -## See additional info into [the SearchMyData-2.0 App wiki](https://github.com/AMProduction/SearchMyData/wiki) \ No newline at end of file +## See additional info into [the SearchMyData-2.0 App wiki](https://github.com/AMProduction/SearchMyData-2.0/wiki) \ No newline at end of file diff --git a/changelog.md b/changelog.md new file mode 100644 index 0000000..d32892c --- /dev/null +++ b/changelog.md @@ -0,0 +1,2 @@ +### v2.0. First release. XX/XX/2021 +* ??? \ No newline at end of file diff --git a/flaskr/__init__.py b/flaskr/__init__.py index 8c8ed2b..cbeda2c 100644 --- a/flaskr/__init__.py +++ b/flaskr/__init__.py @@ -24,12 +24,14 @@ def get_search_results(): from .src.LegalEntitiesRegister import LegalEntitiesRegister from .src.MissingPersonsRegister import MissingPersonsRegister from .src.WantedPersonsRegister import WantedPersonsRegister + from .src.LustratedPersonsRegister import LustratedPersonsRegister # create instances missing_persons = MissingPersonsRegister() wanted_persons = WantedPersonsRegister() debtors = DebtorsRegister() legal_entities = LegalEntitiesRegister() entrepreneurs = EntrepreneursRegister() + lustrated = LustratedPersonsRegister() if request.method == 'POST': search_string = request.form['search'] # call search methods @@ -38,8 +40,10 @@ def get_search_results(): result_debtors = debtors.search_into_collection(search_string) result_legal_entities = legal_entities.search_into_collection(search_string) result_entrepreneurs = entrepreneurs.search_into_collection(search_string) + result_lustrated = lustrated.search_into_collection(search_string) return render_template('result.html', resultMissingPersons=result_missing_persons, resultWantedPersons=result_wanted_persons, resultDebtors=result_debtors, - resultLegalEntities=result_legal_entities, resultEntrepreneurs=result_entrepreneurs) + resultLegalEntities=result_legal_entities, resultEntrepreneurs=result_entrepreneurs, + resultLustrated=result_lustrated) return app diff --git a/flaskr/src/DebtorsRegister.py b/flaskr/src/DebtorsRegister.py index 37d6842..6af8828 100644 --- a/flaskr/src/DebtorsRegister.py +++ b/flaskr/src/DebtorsRegister.py @@ -1,17 +1,16 @@ import gc import json import logging +import mmap import os import shutil import zipfile -import mmap -import requests from datetime import datetime from io import BytesIO -from pymongo.errors import PyMongoError - +import requests from dask import dataframe as dd +from pymongo.errors import PyMongoError from .dataset import Dataset @@ -71,7 +70,7 @@ def __save_dataset(self, zip_url): # convert CSV to JSON using Dask debtors_csv.to_json('debtorsJson') for file in os.listdir('debtorsJson'): - file_object = open('debtorsJson/'+file, mode='r') + file_object = open('debtorsJson/' + file, mode='r') # map the entire file into memory, size 0 means whole file, normally much faster than buffered i/o mm = mmap.mmap(file_object.fileno(), 0, access=mmap.ACCESS_READ) # iterate over the block, until next newline @@ -93,10 +92,11 @@ def __save_dataset(self, zip_url): @Dataset.measure_execution_time def __clear_collection(self): - debtors_col = self.db['Debtors'] - count_deleted_documents = debtors_col.delete_many({}) - logging.warning('%s documents deleted. The wanted persons collection is empty.', str( - count_deleted_documents.deleted_count)) + if self.is_collection_exists('Debtors'): + debtors_col = self.db['Debtors'] + count_deleted_documents = debtors_col.delete_many({}) + logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The wanted persons ' + f'collection is empty.') @Dataset.measure_execution_time def __create_service_json(self): @@ -126,9 +126,9 @@ def __update_service_json(self): @Dataset.measure_execution_time def __update_metadata(self): - collections_list = self.db.list_collection_names() # update or create DebtorsRegisterServiceJson - if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 3}, limit=1) != 0): + if (self.is_collection_exists('ServiceCollection')) and ( + self.service_col.count_documents({'_id': 3}, limit=1) != 0): self.__update_service_json() logging.info('DebtorsRegisterServiceJson updated') else: @@ -137,10 +137,11 @@ def __update_metadata(self): @Dataset.measure_execution_time def __delete_collection_index(self): - debtors_col = self.db['Debtors'] - if 'full_text' in debtors_col.index_information(): - debtors_col.drop_index('full_text') - logging.warning('Debtors Text index deleted') + if self.is_collection_exists('Debtors'): + debtors_col = self.db['Debtors'] + if 'full_text' in debtors_col.index_information(): + debtors_col.drop_index('full_text') + logging.warning('Debtors Text index deleted') @Dataset.measure_execution_time def __create_collection_index(self): @@ -153,16 +154,16 @@ def search_into_collection(self, query_string): debtors_col = self.db['Debtors'] final_result = 0 try: - resultCount = debtors_col.count_documents({'$text': {'$search': query_string}}) + result_count = debtors_col.count_documents({'$text': {'$search': query_string}}) except PyMongoError: logging.error('Error during search into Debtors Register') else: - if resultCount == 0: + if result_count == 0: logging.warning('The debtors register: No data found') final_result = 0 else: - logging.warning('The debtors register: %s records found', str(resultCount)) - final_result = debtors_col.find({'$text': {'$search': query_string}}, {'score': {'$meta': 'textScore'}})\ + logging.warning(f'The debtors register: {result_count} records found') + final_result = debtors_col.find({'$text': {'$search': query_string}}, {'score': {'$meta': 'textScore'}}) \ .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True) gc.collect() return final_result diff --git a/flaskr/src/EntrepreneursRegister.py b/flaskr/src/EntrepreneursRegister.py index 4a280a2..c5bc3b0 100644 --- a/flaskr/src/EntrepreneursRegister.py +++ b/flaskr/src/EntrepreneursRegister.py @@ -1,8 +1,8 @@ import gc import logging from datetime import datetime -from pymongo.errors import PyMongoError +from pymongo.errors import PyMongoError from .dataset import Dataset @@ -21,10 +21,11 @@ def save_dataset(self): @Dataset.measure_execution_time def clear_collection(self): - entrepreneurs_col = self.db['Entrepreneurs'] - count_deleted_documents = entrepreneurs_col.delete_many({}) - logging.warning('%s documents deleted. The entrepreneurs collection is empty.', str( - count_deleted_documents.deleted_count)) + if self.is_collection_exists('Entrepreneurs'): + entrepreneurs_col = self.db['Entrepreneurs'] + count_deleted_documents = entrepreneurs_col.delete_many({}) + logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The entrepreneurs collection ' + f'is empty.') @Dataset.measure_execution_time def __create_service_json(self): @@ -54,9 +55,9 @@ def __update_service_json(self): @Dataset.measure_execution_time def update_metadata(self): - collections_list = self.db.list_collection_names() # update or create EntrepreneursRegisterServiceJson - if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 5}, limit=1) != 0): + if (self.is_collection_exists('ServiceCollection')) and ( + self.service_col.count_documents({'_id': 5}, limit=1) != 0): self.__update_service_json() logging.info('EntrepreneursRegisterServiceJson updated') else: @@ -65,10 +66,11 @@ def update_metadata(self): @Dataset.measure_execution_time def delete_collection_index(self): - entrepreneurs_col = self.db['Entrepreneurs'] - if 'full_text' in entrepreneurs_col.index_information(): - entrepreneurs_col.drop_index('full_text') - logging.warning('Entrepreneurs Text index deleted') + if self.is_collection_exists('Entrepreneurs'): + entrepreneurs_col = self.db['Entrepreneurs'] + if 'full_text' in entrepreneurs_col.index_information(): + entrepreneurs_col.drop_index('full_text') + logging.warning('Entrepreneurs Text index deleted') @Dataset.measure_execution_time def create_collection_index(self): @@ -81,17 +83,17 @@ def search_into_collection(self, query_string): entrepreneurs_col = self.db['Entrepreneurs'] final_result = 0 try: - resultCount = entrepreneurs_col.count_documents({'$text': {'$search': query_string}}) + result_count = entrepreneurs_col.count_documents({'$text': {'$search': query_string}}) except PyMongoError: logging.error('Error during search into Entrepreneurs Register') else: - if resultCount == 0: + if result_count == 0: logging.warning('The Entrepreneurs register: No data found') final_result = 0 else: - logging.warning('The Entrepreneurs register: %s records found', str(resultCount)) + logging.warning(f'The Entrepreneurs register: {result_count} records found') final_result = entrepreneurs_col.find({'$text': {'$search': query_string}}, - {'score': {'$meta': 'textScore'}})\ - .sort([('score',{'$meta': 'textScore'})]).allow_disk_use(True) + {'score': {'$meta': 'textScore'}}) \ + .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True) gc.collect() return final_result diff --git a/flaskr/src/LegalEntitiesRegister.py b/flaskr/src/LegalEntitiesRegister.py index 8e6c74a..2d2245a 100644 --- a/flaskr/src/LegalEntitiesRegister.py +++ b/flaskr/src/LegalEntitiesRegister.py @@ -5,11 +5,11 @@ import shutil import xml.etree.ElementTree as ET import zipfile -import requests from datetime import datetime from io import BytesIO -from pymongo.errors import PyMongoError +import requests +from pymongo.errors import PyMongoError from .dataset import Dataset @@ -65,7 +65,7 @@ def save_dataset(self, zip_url): logging.warning('File in ZIP: ' + str(xml_file)) # unzip all files entrepreneurs_zip.extractall('Temp') - for xml_file in os.listdir('Temp/'+root_folder_name): + for xml_file in os.listdir('Temp/' + root_folder_name): if xml_file.find('_UO_') != -1: # read the legal Entities Xml file path_to_file = 'Temp/' + root_folder_name + xml_file @@ -81,14 +81,14 @@ def save_dataset(self, zip_url): kved = record.find('KVED').text boss = record.find('BOSS').text beneficiaries_dict = {} - beneficiaryNumber = 1 + beneficiary_number = 1 for beneficiaries in record.iter('BENEFICIARIES'): if beneficiaries.find('BENEFICIARY') is not None: for beneficiary in beneficiaries.iter('BENEFICIARY'): beneficiary_to_dict = beneficiary.text - key = 'beneficiary' + str(beneficiaryNumber) + key = 'beneficiary' + str(beneficiary_number) beneficiaries_dict[key] = beneficiary_to_dict - beneficiaryNumber += 1 + beneficiary_number += 1 founders_dict = {} founders_number = 1 for founders in record.iter('FOUNDERS'): @@ -147,10 +147,11 @@ def save_dataset(self, zip_url): @Dataset.measure_execution_time def clear_collection(self): - legal_entities_col = self.db['LegalEntities'] - count_deleted_documents = legal_entities_col.delete_many({}) - logging.warning('%s documents deleted. The legal entities collection is empty.', str( - count_deleted_documents.deleted_count)) + if self.is_collection_exists('LegalEntities'): + legal_entities_col = self.db['LegalEntities'] + count_deleted_documents = legal_entities_col.delete_many({}) + logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The legal entities ' + f'collection is empty.') @Dataset.measure_execution_time def __create_service_json(self): @@ -180,9 +181,9 @@ def __update_service_json(self): @Dataset.measure_execution_time def update_metadata(self): - collections_list = self.db.list_collection_names() # update or create LegalEntitiesRegisterServiceJson - if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 4}, limit=1) != 0): + if (self.is_collection_exists('ServiceCollection')) and ( + self.service_col.count_documents({'_id': 4}, limit=1) != 0): self.__update_service_json() logging.info('LegalEntitiesRegisterServiceJson updated') else: @@ -191,10 +192,11 @@ def update_metadata(self): @Dataset.measure_execution_time def delete_collection_index(self): - legal_entities_col = self.db['LegalEntities'] - if 'full_text' in legal_entities_col.index_information(): - legal_entities_col.drop_index('full_text') - logging.warning('LegalEntities Text index deleted') + if self.is_collection_exists('LegalEntities'): + legal_entities_col = self.db['LegalEntities'] + if 'full_text' in legal_entities_col.index_information(): + legal_entities_col.drop_index('full_text') + logging.warning('LegalEntities Text index deleted') @Dataset.measure_execution_time def create_collection_index(self): @@ -216,9 +218,9 @@ def search_into_collection(self, query_string): logging.warning('The legal entities register: No data found') final_result = 0 else: - logging.warning('The legal entities register: %s records found', str(result_count)) + logging.warning(f'The legal entities register: {result_count} records found') final_result = legal_entities_col.find({'$text': {'$search': query_string}}, - {'score': {'$meta': 'textScore'}})\ + {'score': {'$meta': 'textScore'}}) \ .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True) gc.collect() return final_result diff --git a/flaskr/src/LustratedPersonsRegister.py b/flaskr/src/LustratedPersonsRegister.py new file mode 100644 index 0000000..03a424a --- /dev/null +++ b/flaskr/src/LustratedPersonsRegister.py @@ -0,0 +1,184 @@ +import gc +import json +import logging +import os +import shutil +import xml.etree.ElementTree as ET +import zipfile +from datetime import datetime +from io import BytesIO + +import requests +from pymongo.errors import PyMongoError + +from .dataset import Dataset + + +class LustratedPersonsRegister(Dataset): + def __init__(self): + super().__init__() + + @Dataset.measure_execution_time + def __get_dataset(self): + try: + general_dataset = requests.get( + 'https://data.gov.ua/api/3/action/package_show?id=8faa71c1-3a54-45e8-8f6e-06c92b1ff8bc').text + except ConnectionError: + logging.error('Error during general LustratedPersonsRegister dataset JSON receiving occurred') + else: + general_dataset_json = json.loads(general_dataset) + logging.info('A general LustratedPersonsRegister dataset JSON received') + # get dataset id + lustrated_persons_general_dataset_id = general_dataset_json['result']['resources'][0]['id'] + try: + # get resources JSON id + lustrated_persons_general_dataset_id_json = requests.get( + 'https://data.gov.ua/api/3/action/resource_show?id=' + lustrated_persons_general_dataset_id).text + except ConnectionError: + logging.error('Error during LustratedPersonsRegister resources JSON id receiving occurred') + else: + lustrated_persons_general_dataset_json = json.loads(lustrated_persons_general_dataset_id_json) + logging.info('A LustratedPersonsRegister resources JSON id received') + # get ZIP url + lustrated_persons_dataset_zip_url = lustrated_persons_general_dataset_json['result']['url'] + return lustrated_persons_dataset_zip_url + + @Dataset.measure_execution_time + def __save_dataset(self, zip_url): + lustrated_col = self.db['Lustrated'] + try: + # get ZIP file + lustrated_dataset_zip = requests.get(zip_url).content + except OSError: + logging.error('Error during LustratedPersonsRegister ZIP receiving occurred') + else: + logging.info('A LustratedPersonsRegister dataset received') + # get lists of files + lustrated_zip = zipfile.ZipFile(BytesIO(lustrated_dataset_zip), 'r') + # go inside ZIP + root_folder_name = '' + for xml_file in lustrated_zip.namelist(): + # skip root folder + if xml_file.endswith('/'): + root_folder_name = xml_file + continue + logging.warning('File in ZIP: ' + str(xml_file)) + # unzip + lustrated_zip.extractall('Temp') + lustrated_zip.close() + for xml_file in os.listdir('Temp/' + root_folder_name): + # read the lustrated persons Xml file + path_to_file = 'Temp/' + root_folder_name + xml_file + # parse xml + lustrated_json = {} + tree = ET.parse(path_to_file) + xml_data = tree.getroot() + for record in xml_data: + fio = record.find('FIO').text + job = record.find('JOB').text + judgment_composition = record.find('JUDGMENT_COMPOSITION').text + period = record.find('PERIOD').text + lustrated_json = { + 'fio': fio, + 'job': job, + 'judgment_composition': judgment_composition, + 'period': period + } + try: + # save to the collection + lustrated_col.insert_one(lustrated_json) + except PyMongoError: + logging.error('Error during saving Lustrated Persons Register into Database') + logging.info('Lustrated Persons dataset was saved into the database') + finally: + # delete temp files + shutil.rmtree('Temp', ignore_errors=True) + gc.collect() + + @Dataset.measure_execution_time + def __clear_collection(self): + if self.is_collection_exists('Lustrated'): + lustrated_col = self.db['Lustrated'] + count_deleted_documents = lustrated_col.delete_many({}) + logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The Lustrated Persons ' + f'collection is empty.') + + @Dataset.measure_execution_time + def __create_service_json(self): + created_date = datetime.now() + last_modified_date = datetime.now() + lustrated_col = self.db['Lustrated'] + documents_count = lustrated_col.count_documents({}) + lustrated_register_service_json = { + '_id': 6, + 'Description': 'Єдиний державний реєстр осіб, щодо яких застосовано положення Закону України «Про очищення влади»', + 'DocumentsCount': documents_count, + 'CreatedDate': str(created_date), + 'LastModifiedDate': str(last_modified_date) + } + self.service_col.insert_one(lustrated_register_service_json) + + @Dataset.measure_execution_time + def __update_service_json(self): + last_modified_date = datetime.now() + lustrated_col = self.db['Lustrated'] + documents_count = lustrated_col.count_documents({}) + self.service_col.update_one( + {'_id': 6}, + {'$set': {'LastModifiedDate': str(last_modified_date), + 'DocumentsCount': documents_count}} + ) + + @Dataset.measure_execution_time + def __update_metadata(self): + # update or create LustratedPersonsRegisterServiceJson + if (self.is_collection_exists('ServiceCollection')) and ( + self.service_col.count_documents({'_id': 6}, limit=1) != 0): + self.__update_service_json() + logging.info('LustratedPersonsRegisterServiceJson updated') + else: + self.__create_service_json() + logging.info('LustratedPersonsRegisterServiceJson created') + + @Dataset.measure_execution_time + def __delete_collection_index(self): + if self.is_collection_exists('Lustrated'): + lustrated_col = self.db['Lustrated'] + if 'full_text' in lustrated_col.index_information(): + lustrated_col.drop_index('full_text') + logging.warning('Lustrated Text index deleted') + + @Dataset.measure_execution_time + def __create_collection_index(self): + lustrated_col = self.db['Lustrated'] + lustrated_col.create_index([('fio', 'text')], name='full_text') + logging.info('Lustrated Text Index created') + + @Dataset.measure_execution_time + def search_into_collection(self, query_string): + lustrated_col = self.db['Lustrated'] + final_result = 0 + try: + result_count = lustrated_col.count_documents({'$text': {'$search': query_string}}) + except PyMongoError: + logging.error('Error during search into The Lustrated Persons Register') + else: + if result_count == 0: + logging.warning('The Lustrated Persons register: No data found') + final_result = 0 + else: + logging.warning(f'The Lustrated Persons register: {result_count} records found') + final_result = lustrated_col.find({'$text': {'$search': query_string}}, + {'score': {'$meta': 'textScore'}}) \ + .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True) + gc.collect() + return final_result + + @Dataset.measure_execution_time + def setup_dataset(self): + self.__delete_collection_index() + self.__clear_collection() + __lustrated_dataset_zip_url = self.__get_dataset() + self.__save_dataset(__lustrated_dataset_zip_url) + self.__update_metadata() + self.__create_collection_index() diff --git a/flaskr/src/MissingPersonsRegister.py b/flaskr/src/MissingPersonsRegister.py index ab3432d..4a3bedd 100644 --- a/flaskr/src/MissingPersonsRegister.py +++ b/flaskr/src/MissingPersonsRegister.py @@ -1,10 +1,10 @@ import gc import json import logging -import requests from datetime import datetime -from pymongo.errors import PyMongoError +import requests +from pymongo.errors import PyMongoError from .dataset import Dataset @@ -59,10 +59,11 @@ def __save_dataset(self, json): @Dataset.measure_execution_time def __clear_collection(self): - missing_persons_col = self.db['MissingPersons'] - count_deleted_documents = missing_persons_col.delete_many({}) - logging.warning('%s documents deleted. The missing persons collection is empty.', str( - count_deleted_documents.deleted_count)) + if self.is_collection_exists('MissingPersons'): + missing_persons_col = self.db['MissingPersons'] + count_deleted_documents = missing_persons_col.delete_many({}) + logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The missing persons ' + f'collection is empty.') @Dataset.measure_execution_time def __create_service_json(self): @@ -92,9 +93,9 @@ def __update_service_json(self): @Dataset.measure_execution_time def __update_metadata(self): - collections_list = self.db.list_collection_names() # update or create MissingPersonsRegisterServiceJson - if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 1}, limit=1) != 0): + if (self.is_collection_exists('ServiceCollection')) and ( + self.service_col.count_documents({'_id': 1}, limit=1) != 0): self.__update_service_json() logging.info('MissingPersonsRegisterServiceJson updated') else: @@ -103,10 +104,11 @@ def __update_metadata(self): @Dataset.measure_execution_time def __delete_collection_index(self): - missing_persons_col = self.db['MissingPersons'] - if 'full_text' in missing_persons_col.index_information(): - missing_persons_col.drop_index('full_text') - logging.warning('Missing persons Text index deleted') + if self.is_collection_exists('MissingPersons'): + missing_persons_col = self.db['MissingPersons'] + if 'full_text' in missing_persons_col.index_information(): + missing_persons_col.drop_index('full_text') + logging.warning('Missing persons Text index deleted') @Dataset.measure_execution_time def __create_collection_index(self): @@ -128,10 +130,10 @@ def search_into_collection(self, query_string): logging.warning('The missing persons register: No data found') final_result = 0 else: - logging.warning('The missing persons register: %s records found', str(result_count)) + logging.warning(f'The missing persons register: {result_count} records found') final_result = missing_persons_col.find({'$text': {'$search': query_string}}, - {'score': {'$meta': 'textScore'}})\ - .sort([('score', {'$meta': 'textScore'})]) + {'score': {'$meta': 'textScore'}}) \ + .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True) gc.collect() return final_result diff --git a/flaskr/src/ServiceTools.py b/flaskr/src/ServiceTools.py index 14d8adc..18cea3d 100644 --- a/flaskr/src/ServiceTools.py +++ b/flaskr/src/ServiceTools.py @@ -1,15 +1,23 @@ import json import logging -import os -from pathlib import Path from datetime import datetime, timedelta +from pathlib import Path import pymongo -from prettytable import PrettyTable from pymongo.errors import ServerSelectionTimeoutError class ServiceTools: + """Class for service purpose + + -------- + Methods: + -------- + get_registers_info(): + Show the list of available registers, count of documents, and the last update date + check_is_expired(): + Check if a dataset last updated date is older than 2 days ago + """ def __init__(self): self.logger = logging.getLogger(__name__) @@ -48,16 +56,20 @@ def __init__(self): f'{self.__class__.__name__}: Config.json is not found') def get_registers_info(self): - result = self.__serviceCol.find({}, {'_id': 1, 'Description': 1, 'DocumentsCount': 1, 'LastModifiedDate': 1})\ + """Show the list of available registers, count of documents, and the last update date + """ + result = self.__serviceCol.find({}, {'_id': 1, 'Description': 1, 'DocumentsCount': 1, 'LastModifiedDate': 1}) \ .sort([('_id', 1)]) return result def check_is_expired(self): + """Check if a dataset last updated date is older than 2 days ago + """ is_expired = False expired_time = datetime.now() - timedelta(days=2) for record in self.__serviceCol.find(): last_modified_date = datetime.strptime(record['LastModifiedDate'], '%Y-%m-%d %H:%M:%S.%f') if last_modified_date < expired_time: - logging.warning(record['Description'] + ' is out of date') + logging.warning(f'{record["Description"]} is out of date') is_expired = True return is_expired diff --git a/flaskr/src/WantedPersonsRegister.py b/flaskr/src/WantedPersonsRegister.py index 1a151b5..04a8f43 100644 --- a/flaskr/src/WantedPersonsRegister.py +++ b/flaskr/src/WantedPersonsRegister.py @@ -1,10 +1,10 @@ import gc import json import logging -import requests from datetime import datetime -from pymongo.errors import PyMongoError +import requests +from pymongo.errors import PyMongoError from .dataset import Dataset @@ -59,10 +59,11 @@ def __save_dataset(self, json): @Dataset.measure_execution_time def __clear_collection(self): - wanted_persons_col = self.db['WantedPersons'] - count_deleted_documents = wanted_persons_col.delete_many({}) - logging.warning('%s documents deleted. The wanted persons collection is empty.', str( - count_deleted_documents.deleted_count)) + if self.is_collection_exists('WantedPersons'): + wanted_persons_col = self.db['WantedPersons'] + count_deleted_documents = wanted_persons_col.delete_many({}) + logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The wanted persons ' + f'collection is empty.') @Dataset.measure_execution_time def __create_service_json(self): @@ -92,9 +93,9 @@ def __update_service_json(self): @Dataset.measure_execution_time def __update_metadata(self): - collections_list = self.db.list_collection_names() # update or create WantedPersonsRegisterServiceJson - if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 2}, limit=1) != 0): + if (self.is_collection_exists('ServiceCollection')) and ( + self.service_col.count_documents({'_id': 2}, limit=1) != 0): self.__update_service_json() logging.info('WantedPersonsRegisterServiceJson updated') else: @@ -103,10 +104,11 @@ def __update_metadata(self): @Dataset.measure_execution_time def __delete_collection_index(self): - wanted_persons_col = self.db['WantedPersons'] - if 'full_text' in wanted_persons_col.index_information(): - wanted_persons_col.drop_index('full_text') - logging.warning('WantedPersons Text index deleted') + if self.is_collection_exists('WantedPersons'): + wanted_persons_col = self.db['WantedPersons'] + if 'full_text' in wanted_persons_col.index_information(): + wanted_persons_col.drop_index('full_text') + logging.warning('WantedPersons Text index deleted') @Dataset.measure_execution_time def __create_collection_index(self): @@ -128,10 +130,10 @@ def search_into_collection(self, query_string): logging.warning('The wanted persons register: No data found') final_result = 0 else: - logging.warning('The wanted persons register: %s records found', str(result_count)) + logging.warning(f'The wanted persons register: {result_count} records found') final_result = wanted_persons_col.find({'$text': {'$search': query_string}}, - {'score': {'$meta': 'textScore'}})\ - .sort([('score', {'$meta': 'textScore'})]) + {'score': {'$meta': 'textScore'}}) \ + .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True) gc.collect() return final_result diff --git a/flaskr/src/dataset.py b/flaskr/src/dataset.py index 13fa537..79f78aa 100644 --- a/flaskr/src/dataset.py +++ b/flaskr/src/dataset.py @@ -1,14 +1,44 @@ import json import logging -from pathlib import Path -from functools import wraps from datetime import datetime +from functools import wraps +from pathlib import Path import pymongo from pymongo.errors import ServerSelectionTimeoutError class Dataset: + """Base parent class for all datasets + + -------- + Methods: + -------- + get_dataset(): + Get the link to the dataset. Return the link to the dataset's source file + save_dataset(): + Save the dataset into the Database. Input parameter - the link to the dataset's source file. + clear_collection(): + Purge the collection + __create_service_json(): + Create and save a JSON with service information about a dataset + __update_service_json(): + Update and save a JSON with service information about a dataset + update_metadata(): + Call __create_service_json() if a dataset is first time saved. Or call __update_service_json() if a dataset refreshed + delete_collection_index(): + Drop a database full-text search index + create_collection_index(): + Create a database full-text search index + search_into_collection(): + Search, show and save search results + setup_dataset(): + A sequence of class methods to setup a dataset + measure_execution_time(): + A service function / a decorator to measure up execution time + is_collection_exists(): + Check if a collection exists. Input parameter - a collection name + """ def __init__(self): self.logger = logging.getLogger(__name__) @@ -46,37 +76,62 @@ def __init__(self): logging.error( f'{self.__class__.__name__}: Config.json is not found') - def get_dataset(self): + def __get_dataset(self): + """Get the link to the dataset. + Return the link to the dataset's source file + """ pass - def save_dataset(self): + def __save_dataset(self): + """Save the dataset into the Database. + Input parameter - the link to the dataset's source file. + """ pass - def clear_collection(self): + def __clear_collection(self): + """Purge the collection + """ pass def __create_service_json(self): + """Create and save a JSON with service information about a dataset + """ pass def __update_service_json(self): + """Update and save a JSON with service information about a dataset + """ pass - def update_metadata(self): + def __update_metadata(self): + """Call __create_service_json() if a dataset is first time saved. Or call __update_service_json() if a dataset refreshed + """ pass - def delete_collection_index(self): + def __delete_collection_index(self): + """Drop a database full-text search index + """ pass - def create_collection_index(self): + def __create_collection_index(self): + """Create a database full-text search index + """ pass - def search_into_collection(self): + def search_into_collection(self, query_string): + """Search, show and save search results + """ pass def setup_dataset(self): + """A sequence of class methods to setup a dataset + """ pass def measure_execution_time(func): + """A service function / a decorator to measure up execution time + """ + @wraps(func) def log_time(*args, **kwargs): start_time = datetime.now() @@ -86,4 +141,14 @@ def log_time(*args, **kwargs): end_time = datetime.now() logging.info( f'Total execution time {args[0].__class__.__name__}.{func.__name__}: {end_time - start_time}') + return log_time + + def is_collection_exists(self, collection_name): + """Check if a collection exists. + + :param collection_name: str/a collection name + :return: True - if a collection exists; False - if not + """ + collections_list = self.db.list_collection_names() + return collection_name in collections_list diff --git a/flaskr/templates/home.html b/flaskr/templates/home.html index 004d5c3..36bbed7 100644 --- a/flaskr/templates/home.html +++ b/flaskr/templates/home.html @@ -4,11 +4,16 @@ SearchMyData-2.0 - - + + @@ -17,7 +22,8 @@ SearchMyData -
-
+
diff --git a/flaskr/templates/info.html b/flaskr/templates/info.html index aeec428..b0064bd 100644 --- a/flaskr/templates/info.html +++ b/flaskr/templates/info.html @@ -4,11 +4,16 @@ Datasets info - - + + @@ -27,32 +32,34 @@
- - - - - - + + + + + + {% if result %} - {% for row in result %} + {% for row in result %} - {% endfor %} + {% endfor %} {% endif %} {% if isExpired %} - - - + + + - {% endif %} + {% endif %}
#DescriptionDocuments countLast modified date
#DescriptionDocuments countLast modified date
{{ row['_id'] }} {{ row['Description'] }} {{ row['DocumentsCount'] }} {{ '{:.19}'.format(row['LastModifiedDate']) }}
Warning! One or more datasets are out of date. Please, refresh!
Warning! One or more datasets are out of date. + Please, refresh! +
diff --git a/flaskr/templates/result.html b/flaskr/templates/result.html index e6aa11f..b599499 100644 --- a/flaskr/templates/result.html +++ b/flaskr/templates/result.html @@ -4,11 +4,17 @@ Results - - + + @@ -42,40 +48,59 @@ - +
- - - - - - - - + + + + + + + + {% if resultMissingPersons != 0 %} - {% for row in resultMissingPersons %} + {% for row in resultMissingPersons %} @@ -84,36 +109,38 @@ - {% endfor %} + {% endfor %} {% elif resultMissingPersons == 0 %} - - - + + + - {% endif %} + {% endif %}
Інформація про безвісно зниклих громадян
LAST NAMEFIRST NAMEMIDDLE NAMEBIRTH DATELOST PLACELOST DATE
LAST NAMEFIRST NAMEMIDDLE NAMEBIRTH DATELOST PLACELOST DATE
{{ row['LAST_NAME_U'] }} {{ row['FIRST_NAME_U'] }}{{ row['LOST_PLACE'] }} {{ '{:.10}'.format(row['LOST_DATE']) }}
The missing persons register: No data found
The missing persons register: No data + found +
- - - - - - - - - - - + + + + + + + + + + + {% if resultWantedPersons != 0 %} - {% for row in resultWantedPersons %} + {% for row in resultWantedPersons %} @@ -125,33 +152,35 @@ - {% endfor %} + {% endfor %} {% elif resultWantedPersons == 0 %} - - - + + + - {% endif %} + {% endif %}
Інформація про осіб, які переховуються від органів влади
LAST NAMEFIRST NAMEMIDDLE NAMEBIRTH DATELOST PLACELOST DATECATEGORYWHO IS SEARCHINGCRIME
LAST NAMEFIRST NAMEMIDDLE NAMEBIRTH DATELOST PLACELOST DATECATEGORYWHO IS SEARCHINGCRIME
{{ row['LAST_NAME_U'] }} {{ row['FIRST_NAME_U'] }}{{ row['OVD'] }} {{ row['ARTICLE_CRIM'] }}
The wanted persons register: No data found
The wanted persons register: No data + found +
- - - - - - - - + + + + + + + + {% if resultDebtors != 0 %} - {% for row in resultDebtors %} + {% for row in resultDebtors %} @@ -160,34 +189,35 @@ - {% endfor %} + {% endfor %} {% elif resultDebtors == 0 %} - - - + + + - {% endif %} + {% endif %}
Єдиний реєстр боржників
DEBTOR NAMEDEBTOR CODEPUBLISHEREXECUTIVE SERVICEEXECUTIVE SERVICE EMPLOYEECATEGORY
DEBTOR NAMEDEBTOR CODEPUBLISHEREXECUTIVE SERVICEEXECUTIVE SERVICE EMPLOYEECATEGORY
{{ row['DEBTOR_NAME'] }} {{ row['DEBTOR_CODE'] }}{{ row['EMP_FULL_FIO'] }} {{ row['VD_CAT'] }}
The debtors register: No data found
The debtors register: No data found +
- - - - - - - - - + + + + + + + + + {% if resultLegalEntities != 0 %} - {% for row in resultLegalEntities %} + {% for row in resultLegalEntities %} @@ -197,46 +227,85 @@ - {% endfor %} + {% endfor %} {% elif resultLegalEntities == 0 %} - - - + + + - {% endif %} + {% endif %}
Єдиний державний реєстр юридичних осіб та громадських формувань
SHORT NAMEEDRPOUADDRESSKVEDBOSSFOUNDERSSTATE
SHORT NAMEEDRPOUADDRESSKVEDBOSSFOUNDERSSTATE
{{ row['short_name'] }} {{ row['edrpou'] }}{{ row['founders'] }} {{ row['stan'] }}
The legal entities register: No data found
The legal entities register: No data + found +
- - - - - - + + + + + + {% if resultEntrepreneurs != 0 %} - {% for row in resultEntrepreneurs %} + {% for row in resultEntrepreneurs %} - {% endfor %} + {% endfor %} {% elif resultEntrepreneurs == 0 %} + + + + + {% endif %} +
Єдиний державний реєстр фізичних осіб – підприємців
NAMEADDRESSKVEDSTATE
NAMEADDRESSKVEDSTATE
{{ row['fio'] }} {{ row['address'] }} {{ row['kved'] }} {{ row['stan'] }}
The Entrepreneurs register: No data + found +
+
+
+ + + + + + + + + + + {% if resultLustrated != 0 %} + + {% for row in resultLustrated %} - + + + + + {% endfor %} + + {% elif resultLustrated == 0 %} + + + + - {% endif %} + {% endif %}
Єдиний державний реєстр осіб, щодо яких застосовано положення Закону України «Про очищення + влади» +
NAMEJOBJUDGMENT COMPOSITIONPERIOD
The Entrepreneurs register: No data found{{ row['fio'] }}{{ row['job'] }}{{ row['judgment_composition'] }}{{ row['period'] }}
The Lustrated Persons register: No data + found +
@@ -256,25 +325,26 @@ // When the user scrolls down 20px from the top of the document, show the button window.onscroll = function () { - scrollFunction(); + scrollFunction(); }; function scrollFunction() { - if ( - document.body.scrollTop > 20 || - document.documentElement.scrollTop > 20 - ) { - mybutton.style.display = "block"; - } else { - mybutton.style.display = "none"; - } + if ( + document.body.scrollTop > 20 || + document.documentElement.scrollTop > 20 + ) { + mybutton.style.display = "block"; + } else { + mybutton.style.display = "none"; + } } + // When the user clicks on the button, scroll to the top of the document mybutton.addEventListener("click", backToTop); function backToTop() { - document.body.scrollTop = 0; - document.documentElement.scrollTop = 0; + document.body.scrollTop = 0; + document.documentElement.scrollTop = 0; } diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4fde6bf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +pymongo>=3.12.1 +requests>=2.26.0 +dask>=2021.11.2 +Flask>=2.0.2 \ No newline at end of file