diff --git a/.gitignore b/.gitignore
index 7e86cf0..5a008fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -138,3 +138,4 @@ dmypy.json
# Cython debug symbols
cython_debug/
+.idea/
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/git_toolbox_prj.xml b/.idea/git_toolbox_prj.xml
new file mode 100644
index 0000000..b382006
--- /dev/null
+++ b/.idea/git_toolbox_prj.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index 9d9d7f9..57e018c 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,13 @@
*The application is under migrating to the Flask framework. Not all functionalities are available now.*
-# Welcome to the SearchMyData-2.0 App! The web version of [the SearchMyData App](https://github.com/AMProduction/SearchMyData)
+# Welcome to the SearchMyData-2.0 App! The web version of [the SearchMyData App](https://github.com/AMProduction/SearchMyData/wiki)
## Description
The app gives the possibility to perform a search into the Ukrainian government [Open data portal](https://data.gov.ua/en/) datasets.
-At this moment (October 2021) integrated [Information on missing citizens](https://data.gov.ua/en/dataset/470196d3-4e7a-46b0-8c0c-883b74ac65f0), [Information about people hiding from the authorities](https://data.gov.ua/en/dataset/7c51c4a0-104b-4540-a166-e9fc58485c1b), [Unified register of debtors](https://data.gov.ua/dataset/506734bf-2480-448c-a2b4-90b6d06df11e) and [Unified State Register of Legal Entities, Individual Entrepreneurs and Public Associations](https://data.gov.ua/dataset/1c7f3815-3259-45e0-bdf1-64dca07ddc10).
+At this moment (November 2021) integrated [Information on missing citizens](https://data.gov.ua/en/dataset/470196d3-4e7a-46b0-8c0c-883b74ac65f0),
+[Information about people hiding from the authorities](https://data.gov.ua/en/dataset/7c51c4a0-104b-4540-a166-e9fc58485c1b),
+[Unified register of debtors](https://data.gov.ua/dataset/506734bf-2480-448c-a2b4-90b6d06df11e),
+[Unified State Register of Legal Entities, Individual Entrepreneurs and Public Associations](https://data.gov.ua/dataset/1c7f3815-3259-45e0-bdf1-64dca07ddc10)
+and [Integrated Unified State Register of Lustrated Persons](https://data.gov.ua/dataset/8faa71c1-3a54-45e8-8f6e-06c92b1ff8bc).
## Hardware requirements
* 16Gb+ RAM
* SSD
-## See additional info into [the SearchMyData-2.0 App wiki](https://github.com/AMProduction/SearchMyData/wiki)
\ No newline at end of file
+## See additional info in [the SearchMyData-2.0 App wiki](https://github.com/AMProduction/SearchMyData-2.0/wiki)
\ No newline at end of file
diff --git a/changelog.md b/changelog.md
new file mode 100644
index 0000000..d32892c
--- /dev/null
+++ b/changelog.md
@@ -0,0 +1,2 @@
+### v2.0. First release. XX/XX/2021
+* ???
\ No newline at end of file
diff --git a/flaskr/__init__.py b/flaskr/__init__.py
index 8c8ed2b..cbeda2c 100644
--- a/flaskr/__init__.py
+++ b/flaskr/__init__.py
@@ -24,12 +24,14 @@ def get_search_results():
from .src.LegalEntitiesRegister import LegalEntitiesRegister
from .src.MissingPersonsRegister import MissingPersonsRegister
from .src.WantedPersonsRegister import WantedPersonsRegister
+ from .src.LustratedPersonsRegister import LustratedPersonsRegister
# create instances
missing_persons = MissingPersonsRegister()
wanted_persons = WantedPersonsRegister()
debtors = DebtorsRegister()
legal_entities = LegalEntitiesRegister()
entrepreneurs = EntrepreneursRegister()
+ lustrated = LustratedPersonsRegister()
if request.method == 'POST':
search_string = request.form['search']
# call search methods
@@ -38,8 +40,10 @@ def get_search_results():
result_debtors = debtors.search_into_collection(search_string)
result_legal_entities = legal_entities.search_into_collection(search_string)
result_entrepreneurs = entrepreneurs.search_into_collection(search_string)
+ result_lustrated = lustrated.search_into_collection(search_string)
return render_template('result.html', resultMissingPersons=result_missing_persons,
resultWantedPersons=result_wanted_persons, resultDebtors=result_debtors,
- resultLegalEntities=result_legal_entities, resultEntrepreneurs=result_entrepreneurs)
+ resultLegalEntities=result_legal_entities, resultEntrepreneurs=result_entrepreneurs,
+ resultLustrated=result_lustrated)
return app
diff --git a/flaskr/src/DebtorsRegister.py b/flaskr/src/DebtorsRegister.py
index 37d6842..6af8828 100644
--- a/flaskr/src/DebtorsRegister.py
+++ b/flaskr/src/DebtorsRegister.py
@@ -1,17 +1,16 @@
import gc
import json
import logging
+import mmap
import os
import shutil
import zipfile
-import mmap
-import requests
from datetime import datetime
from io import BytesIO
-from pymongo.errors import PyMongoError
-
+import requests
from dask import dataframe as dd
+from pymongo.errors import PyMongoError
from .dataset import Dataset
@@ -71,7 +70,7 @@ def __save_dataset(self, zip_url):
# convert CSV to JSON using Dask
debtors_csv.to_json('debtorsJson')
for file in os.listdir('debtorsJson'):
- file_object = open('debtorsJson/'+file, mode='r')
+ file_object = open('debtorsJson/' + file, mode='r')
# map the entire file into memory, size 0 means whole file, normally much faster than buffered i/o
mm = mmap.mmap(file_object.fileno(), 0, access=mmap.ACCESS_READ)
# iterate over the block, until next newline
@@ -93,10 +92,11 @@ def __save_dataset(self, zip_url):
@Dataset.measure_execution_time
def __clear_collection(self):
- debtors_col = self.db['Debtors']
- count_deleted_documents = debtors_col.delete_many({})
- logging.warning('%s documents deleted. The wanted persons collection is empty.', str(
- count_deleted_documents.deleted_count))
+ if self.is_collection_exists('Debtors'):
+ debtors_col = self.db['Debtors']
+ count_deleted_documents = debtors_col.delete_many({})
+            logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The debtors '
+                            f'collection is empty.')
@Dataset.measure_execution_time
def __create_service_json(self):
@@ -126,9 +126,9 @@ def __update_service_json(self):
@Dataset.measure_execution_time
def __update_metadata(self):
- collections_list = self.db.list_collection_names()
# update or create DebtorsRegisterServiceJson
- if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 3}, limit=1) != 0):
+ if (self.is_collection_exists('ServiceCollection')) and (
+ self.service_col.count_documents({'_id': 3}, limit=1) != 0):
self.__update_service_json()
logging.info('DebtorsRegisterServiceJson updated')
else:
@@ -137,10 +137,11 @@ def __update_metadata(self):
@Dataset.measure_execution_time
def __delete_collection_index(self):
- debtors_col = self.db['Debtors']
- if 'full_text' in debtors_col.index_information():
- debtors_col.drop_index('full_text')
- logging.warning('Debtors Text index deleted')
+ if self.is_collection_exists('Debtors'):
+ debtors_col = self.db['Debtors']
+ if 'full_text' in debtors_col.index_information():
+ debtors_col.drop_index('full_text')
+ logging.warning('Debtors Text index deleted')
@Dataset.measure_execution_time
def __create_collection_index(self):
@@ -153,16 +154,16 @@ def search_into_collection(self, query_string):
debtors_col = self.db['Debtors']
final_result = 0
try:
- resultCount = debtors_col.count_documents({'$text': {'$search': query_string}})
+ result_count = debtors_col.count_documents({'$text': {'$search': query_string}})
except PyMongoError:
logging.error('Error during search into Debtors Register')
else:
- if resultCount == 0:
+ if result_count == 0:
logging.warning('The debtors register: No data found')
final_result = 0
else:
- logging.warning('The debtors register: %s records found', str(resultCount))
- final_result = debtors_col.find({'$text': {'$search': query_string}}, {'score': {'$meta': 'textScore'}})\
+ logging.warning(f'The debtors register: {result_count} records found')
+ final_result = debtors_col.find({'$text': {'$search': query_string}}, {'score': {'$meta': 'textScore'}}) \
.sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
gc.collect()
return final_result
diff --git a/flaskr/src/EntrepreneursRegister.py b/flaskr/src/EntrepreneursRegister.py
index 4a280a2..c5bc3b0 100644
--- a/flaskr/src/EntrepreneursRegister.py
+++ b/flaskr/src/EntrepreneursRegister.py
@@ -1,8 +1,8 @@
import gc
import logging
from datetime import datetime
-from pymongo.errors import PyMongoError
+from pymongo.errors import PyMongoError
from .dataset import Dataset
@@ -21,10 +21,11 @@ def save_dataset(self):
@Dataset.measure_execution_time
def clear_collection(self):
- entrepreneurs_col = self.db['Entrepreneurs']
- count_deleted_documents = entrepreneurs_col.delete_many({})
- logging.warning('%s documents deleted. The entrepreneurs collection is empty.', str(
- count_deleted_documents.deleted_count))
+ if self.is_collection_exists('Entrepreneurs'):
+ entrepreneurs_col = self.db['Entrepreneurs']
+ count_deleted_documents = entrepreneurs_col.delete_many({})
+ logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The entrepreneurs collection '
+ f'is empty.')
@Dataset.measure_execution_time
def __create_service_json(self):
@@ -54,9 +55,9 @@ def __update_service_json(self):
@Dataset.measure_execution_time
def update_metadata(self):
- collections_list = self.db.list_collection_names()
# update or create EntrepreneursRegisterServiceJson
- if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 5}, limit=1) != 0):
+ if (self.is_collection_exists('ServiceCollection')) and (
+ self.service_col.count_documents({'_id': 5}, limit=1) != 0):
self.__update_service_json()
logging.info('EntrepreneursRegisterServiceJson updated')
else:
@@ -65,10 +66,11 @@ def update_metadata(self):
@Dataset.measure_execution_time
def delete_collection_index(self):
- entrepreneurs_col = self.db['Entrepreneurs']
- if 'full_text' in entrepreneurs_col.index_information():
- entrepreneurs_col.drop_index('full_text')
- logging.warning('Entrepreneurs Text index deleted')
+ if self.is_collection_exists('Entrepreneurs'):
+ entrepreneurs_col = self.db['Entrepreneurs']
+ if 'full_text' in entrepreneurs_col.index_information():
+ entrepreneurs_col.drop_index('full_text')
+ logging.warning('Entrepreneurs Text index deleted')
@Dataset.measure_execution_time
def create_collection_index(self):
@@ -81,17 +83,17 @@ def search_into_collection(self, query_string):
entrepreneurs_col = self.db['Entrepreneurs']
final_result = 0
try:
- resultCount = entrepreneurs_col.count_documents({'$text': {'$search': query_string}})
+ result_count = entrepreneurs_col.count_documents({'$text': {'$search': query_string}})
except PyMongoError:
logging.error('Error during search into Entrepreneurs Register')
else:
- if resultCount == 0:
+ if result_count == 0:
logging.warning('The Entrepreneurs register: No data found')
final_result = 0
else:
- logging.warning('The Entrepreneurs register: %s records found', str(resultCount))
+ logging.warning(f'The Entrepreneurs register: {result_count} records found')
final_result = entrepreneurs_col.find({'$text': {'$search': query_string}},
- {'score': {'$meta': 'textScore'}})\
- .sort([('score',{'$meta': 'textScore'})]).allow_disk_use(True)
+ {'score': {'$meta': 'textScore'}}) \
+ .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
gc.collect()
return final_result
diff --git a/flaskr/src/LegalEntitiesRegister.py b/flaskr/src/LegalEntitiesRegister.py
index 8e6c74a..2d2245a 100644
--- a/flaskr/src/LegalEntitiesRegister.py
+++ b/flaskr/src/LegalEntitiesRegister.py
@@ -5,11 +5,11 @@
import shutil
import xml.etree.ElementTree as ET
import zipfile
-import requests
from datetime import datetime
from io import BytesIO
-from pymongo.errors import PyMongoError
+import requests
+from pymongo.errors import PyMongoError
from .dataset import Dataset
@@ -65,7 +65,7 @@ def save_dataset(self, zip_url):
logging.warning('File in ZIP: ' + str(xml_file))
# unzip all files
entrepreneurs_zip.extractall('Temp')
- for xml_file in os.listdir('Temp/'+root_folder_name):
+ for xml_file in os.listdir('Temp/' + root_folder_name):
if xml_file.find('_UO_') != -1:
# read the legal Entities Xml file
path_to_file = 'Temp/' + root_folder_name + xml_file
@@ -81,14 +81,14 @@ def save_dataset(self, zip_url):
kved = record.find('KVED').text
boss = record.find('BOSS').text
beneficiaries_dict = {}
- beneficiaryNumber = 1
+ beneficiary_number = 1
for beneficiaries in record.iter('BENEFICIARIES'):
if beneficiaries.find('BENEFICIARY') is not None:
for beneficiary in beneficiaries.iter('BENEFICIARY'):
beneficiary_to_dict = beneficiary.text
- key = 'beneficiary' + str(beneficiaryNumber)
+ key = 'beneficiary' + str(beneficiary_number)
beneficiaries_dict[key] = beneficiary_to_dict
- beneficiaryNumber += 1
+ beneficiary_number += 1
founders_dict = {}
founders_number = 1
for founders in record.iter('FOUNDERS'):
@@ -147,10 +147,11 @@ def save_dataset(self, zip_url):
@Dataset.measure_execution_time
def clear_collection(self):
- legal_entities_col = self.db['LegalEntities']
- count_deleted_documents = legal_entities_col.delete_many({})
- logging.warning('%s documents deleted. The legal entities collection is empty.', str(
- count_deleted_documents.deleted_count))
+ if self.is_collection_exists('LegalEntities'):
+ legal_entities_col = self.db['LegalEntities']
+ count_deleted_documents = legal_entities_col.delete_many({})
+ logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The legal entities '
+ f'collection is empty.')
@Dataset.measure_execution_time
def __create_service_json(self):
@@ -180,9 +181,9 @@ def __update_service_json(self):
@Dataset.measure_execution_time
def update_metadata(self):
- collections_list = self.db.list_collection_names()
# update or create LegalEntitiesRegisterServiceJson
- if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 4}, limit=1) != 0):
+ if (self.is_collection_exists('ServiceCollection')) and (
+ self.service_col.count_documents({'_id': 4}, limit=1) != 0):
self.__update_service_json()
logging.info('LegalEntitiesRegisterServiceJson updated')
else:
@@ -191,10 +192,11 @@ def update_metadata(self):
@Dataset.measure_execution_time
def delete_collection_index(self):
- legal_entities_col = self.db['LegalEntities']
- if 'full_text' in legal_entities_col.index_information():
- legal_entities_col.drop_index('full_text')
- logging.warning('LegalEntities Text index deleted')
+ if self.is_collection_exists('LegalEntities'):
+ legal_entities_col = self.db['LegalEntities']
+ if 'full_text' in legal_entities_col.index_information():
+ legal_entities_col.drop_index('full_text')
+ logging.warning('LegalEntities Text index deleted')
@Dataset.measure_execution_time
def create_collection_index(self):
@@ -216,9 +218,9 @@ def search_into_collection(self, query_string):
logging.warning('The legal entities register: No data found')
final_result = 0
else:
- logging.warning('The legal entities register: %s records found', str(result_count))
+ logging.warning(f'The legal entities register: {result_count} records found')
final_result = legal_entities_col.find({'$text': {'$search': query_string}},
- {'score': {'$meta': 'textScore'}})\
+ {'score': {'$meta': 'textScore'}}) \
.sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
gc.collect()
return final_result
diff --git a/flaskr/src/LustratedPersonsRegister.py b/flaskr/src/LustratedPersonsRegister.py
new file mode 100644
index 0000000..03a424a
--- /dev/null
+++ b/flaskr/src/LustratedPersonsRegister.py
@@ -0,0 +1,184 @@
+import gc
+import json
+import logging
+import os
+import shutil
+import xml.etree.ElementTree as ET
+import zipfile
+from datetime import datetime
+from io import BytesIO
+
+import requests
+from pymongo.errors import PyMongoError
+
+from .dataset import Dataset
+
+
+class LustratedPersonsRegister(Dataset):
+ def __init__(self):
+ super().__init__()
+
+ @Dataset.measure_execution_time
+ def __get_dataset(self):
+ try:
+ general_dataset = requests.get(
+ 'https://data.gov.ua/api/3/action/package_show?id=8faa71c1-3a54-45e8-8f6e-06c92b1ff8bc').text
+ except ConnectionError:
+ logging.error('Error during general LustratedPersonsRegister dataset JSON receiving occurred')
+ else:
+ general_dataset_json = json.loads(general_dataset)
+ logging.info('A general LustratedPersonsRegister dataset JSON received')
+ # get dataset id
+ lustrated_persons_general_dataset_id = general_dataset_json['result']['resources'][0]['id']
+ try:
+ # get resources JSON id
+ lustrated_persons_general_dataset_id_json = requests.get(
+ 'https://data.gov.ua/api/3/action/resource_show?id=' + lustrated_persons_general_dataset_id).text
+ except ConnectionError:
+ logging.error('Error during LustratedPersonsRegister resources JSON id receiving occurred')
+ else:
+ lustrated_persons_general_dataset_json = json.loads(lustrated_persons_general_dataset_id_json)
+ logging.info('A LustratedPersonsRegister resources JSON id received')
+ # get ZIP url
+ lustrated_persons_dataset_zip_url = lustrated_persons_general_dataset_json['result']['url']
+ return lustrated_persons_dataset_zip_url
+
+ @Dataset.measure_execution_time
+ def __save_dataset(self, zip_url):
+ lustrated_col = self.db['Lustrated']
+ try:
+ # get ZIP file
+ lustrated_dataset_zip = requests.get(zip_url).content
+ except OSError:
+ logging.error('Error during LustratedPersonsRegister ZIP receiving occurred')
+ else:
+ logging.info('A LustratedPersonsRegister dataset received')
+ # get lists of files
+ lustrated_zip = zipfile.ZipFile(BytesIO(lustrated_dataset_zip), 'r')
+ # go inside ZIP
+ root_folder_name = ''
+ for xml_file in lustrated_zip.namelist():
+ # skip root folder
+ if xml_file.endswith('/'):
+ root_folder_name = xml_file
+ continue
+ logging.warning('File in ZIP: ' + str(xml_file))
+ # unzip
+ lustrated_zip.extractall('Temp')
+ lustrated_zip.close()
+ for xml_file in os.listdir('Temp/' + root_folder_name):
+ # read the lustrated persons Xml file
+ path_to_file = 'Temp/' + root_folder_name + xml_file
+ # parse xml
+ lustrated_json = {}
+ tree = ET.parse(path_to_file)
+ xml_data = tree.getroot()
+ for record in xml_data:
+ fio = record.find('FIO').text
+ job = record.find('JOB').text
+ judgment_composition = record.find('JUDGMENT_COMPOSITION').text
+ period = record.find('PERIOD').text
+ lustrated_json = {
+ 'fio': fio,
+ 'job': job,
+ 'judgment_composition': judgment_composition,
+ 'period': period
+ }
+ try:
+ # save to the collection
+ lustrated_col.insert_one(lustrated_json)
+ except PyMongoError:
+ logging.error('Error during saving Lustrated Persons Register into Database')
+ logging.info('Lustrated Persons dataset was saved into the database')
+ finally:
+ # delete temp files
+ shutil.rmtree('Temp', ignore_errors=True)
+ gc.collect()
+
+ @Dataset.measure_execution_time
+ def __clear_collection(self):
+ if self.is_collection_exists('Lustrated'):
+ lustrated_col = self.db['Lustrated']
+ count_deleted_documents = lustrated_col.delete_many({})
+ logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The Lustrated Persons '
+ f'collection is empty.')
+
+ @Dataset.measure_execution_time
+ def __create_service_json(self):
+ created_date = datetime.now()
+ last_modified_date = datetime.now()
+ lustrated_col = self.db['Lustrated']
+ documents_count = lustrated_col.count_documents({})
+ lustrated_register_service_json = {
+ '_id': 6,
+ 'Description': 'Єдиний державний реєстр осіб, щодо яких застосовано положення Закону України «Про очищення влади»',
+ 'DocumentsCount': documents_count,
+ 'CreatedDate': str(created_date),
+ 'LastModifiedDate': str(last_modified_date)
+ }
+ self.service_col.insert_one(lustrated_register_service_json)
+
+ @Dataset.measure_execution_time
+ def __update_service_json(self):
+ last_modified_date = datetime.now()
+ lustrated_col = self.db['Lustrated']
+ documents_count = lustrated_col.count_documents({})
+ self.service_col.update_one(
+ {'_id': 6},
+ {'$set': {'LastModifiedDate': str(last_modified_date),
+ 'DocumentsCount': documents_count}}
+ )
+
+ @Dataset.measure_execution_time
+ def __update_metadata(self):
+ # update or create LustratedPersonsRegisterServiceJson
+ if (self.is_collection_exists('ServiceCollection')) and (
+ self.service_col.count_documents({'_id': 6}, limit=1) != 0):
+ self.__update_service_json()
+ logging.info('LustratedPersonsRegisterServiceJson updated')
+ else:
+ self.__create_service_json()
+ logging.info('LustratedPersonsRegisterServiceJson created')
+
+ @Dataset.measure_execution_time
+ def __delete_collection_index(self):
+ if self.is_collection_exists('Lustrated'):
+ lustrated_col = self.db['Lustrated']
+ if 'full_text' in lustrated_col.index_information():
+ lustrated_col.drop_index('full_text')
+ logging.warning('Lustrated Text index deleted')
+
+ @Dataset.measure_execution_time
+ def __create_collection_index(self):
+ lustrated_col = self.db['Lustrated']
+ lustrated_col.create_index([('fio', 'text')], name='full_text')
+ logging.info('Lustrated Text Index created')
+
+ @Dataset.measure_execution_time
+ def search_into_collection(self, query_string):
+ lustrated_col = self.db['Lustrated']
+ final_result = 0
+ try:
+ result_count = lustrated_col.count_documents({'$text': {'$search': query_string}})
+ except PyMongoError:
+ logging.error('Error during search into The Lustrated Persons Register')
+ else:
+ if result_count == 0:
+ logging.warning('The Lustrated Persons register: No data found')
+ final_result = 0
+ else:
+ logging.warning(f'The Lustrated Persons register: {result_count} records found')
+ final_result = lustrated_col.find({'$text': {'$search': query_string}},
+ {'score': {'$meta': 'textScore'}}) \
+ .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
+ gc.collect()
+ return final_result
+
+ @Dataset.measure_execution_time
+ def setup_dataset(self):
+ self.__delete_collection_index()
+ self.__clear_collection()
+ __lustrated_dataset_zip_url = self.__get_dataset()
+ self.__save_dataset(__lustrated_dataset_zip_url)
+ self.__update_metadata()
+ self.__create_collection_index()
diff --git a/flaskr/src/MissingPersonsRegister.py b/flaskr/src/MissingPersonsRegister.py
index ab3432d..4a3bedd 100644
--- a/flaskr/src/MissingPersonsRegister.py
+++ b/flaskr/src/MissingPersonsRegister.py
@@ -1,10 +1,10 @@
import gc
import json
import logging
-import requests
from datetime import datetime
-from pymongo.errors import PyMongoError
+import requests
+from pymongo.errors import PyMongoError
from .dataset import Dataset
@@ -59,10 +59,11 @@ def __save_dataset(self, json):
@Dataset.measure_execution_time
def __clear_collection(self):
- missing_persons_col = self.db['MissingPersons']
- count_deleted_documents = missing_persons_col.delete_many({})
- logging.warning('%s documents deleted. The missing persons collection is empty.', str(
- count_deleted_documents.deleted_count))
+ if self.is_collection_exists('MissingPersons'):
+ missing_persons_col = self.db['MissingPersons']
+ count_deleted_documents = missing_persons_col.delete_many({})
+ logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The missing persons '
+ f'collection is empty.')
@Dataset.measure_execution_time
def __create_service_json(self):
@@ -92,9 +93,9 @@ def __update_service_json(self):
@Dataset.measure_execution_time
def __update_metadata(self):
- collections_list = self.db.list_collection_names()
# update or create MissingPersonsRegisterServiceJson
- if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 1}, limit=1) != 0):
+ if (self.is_collection_exists('ServiceCollection')) and (
+ self.service_col.count_documents({'_id': 1}, limit=1) != 0):
self.__update_service_json()
logging.info('MissingPersonsRegisterServiceJson updated')
else:
@@ -103,10 +104,11 @@ def __update_metadata(self):
@Dataset.measure_execution_time
def __delete_collection_index(self):
- missing_persons_col = self.db['MissingPersons']
- if 'full_text' in missing_persons_col.index_information():
- missing_persons_col.drop_index('full_text')
- logging.warning('Missing persons Text index deleted')
+ if self.is_collection_exists('MissingPersons'):
+ missing_persons_col = self.db['MissingPersons']
+ if 'full_text' in missing_persons_col.index_information():
+ missing_persons_col.drop_index('full_text')
+ logging.warning('Missing persons Text index deleted')
@Dataset.measure_execution_time
def __create_collection_index(self):
@@ -128,10 +130,10 @@ def search_into_collection(self, query_string):
logging.warning('The missing persons register: No data found')
final_result = 0
else:
- logging.warning('The missing persons register: %s records found', str(result_count))
+ logging.warning(f'The missing persons register: {result_count} records found')
final_result = missing_persons_col.find({'$text': {'$search': query_string}},
- {'score': {'$meta': 'textScore'}})\
- .sort([('score', {'$meta': 'textScore'})])
+ {'score': {'$meta': 'textScore'}}) \
+ .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
gc.collect()
return final_result
diff --git a/flaskr/src/ServiceTools.py b/flaskr/src/ServiceTools.py
index 14d8adc..18cea3d 100644
--- a/flaskr/src/ServiceTools.py
+++ b/flaskr/src/ServiceTools.py
@@ -1,15 +1,23 @@
import json
import logging
-import os
-from pathlib import Path
from datetime import datetime, timedelta
+from pathlib import Path
import pymongo
-from prettytable import PrettyTable
from pymongo.errors import ServerSelectionTimeoutError
class ServiceTools:
+ """Class for service purpose
+
+ --------
+ Methods:
+ --------
+ get_registers_info():
+ Show the list of available registers, count of documents, and the last update date
+ check_is_expired():
+ Check if a dataset last updated date is older than 2 days ago
+ """
def __init__(self):
self.logger = logging.getLogger(__name__)
@@ -48,16 +56,20 @@ def __init__(self):
f'{self.__class__.__name__}: Config.json is not found')
def get_registers_info(self):
- result = self.__serviceCol.find({}, {'_id': 1, 'Description': 1, 'DocumentsCount': 1, 'LastModifiedDate': 1})\
+ """Show the list of available registers, count of documents, and the last update date
+ """
+ result = self.__serviceCol.find({}, {'_id': 1, 'Description': 1, 'DocumentsCount': 1, 'LastModifiedDate': 1}) \
.sort([('_id', 1)])
return result
def check_is_expired(self):
+ """Check if a dataset last updated date is older than 2 days ago
+ """
is_expired = False
expired_time = datetime.now() - timedelta(days=2)
for record in self.__serviceCol.find():
last_modified_date = datetime.strptime(record['LastModifiedDate'], '%Y-%m-%d %H:%M:%S.%f')
if last_modified_date < expired_time:
- logging.warning(record['Description'] + ' is out of date')
+ logging.warning(f'{record["Description"]} is out of date')
is_expired = True
return is_expired
diff --git a/flaskr/src/WantedPersonsRegister.py b/flaskr/src/WantedPersonsRegister.py
index 1a151b5..04a8f43 100644
--- a/flaskr/src/WantedPersonsRegister.py
+++ b/flaskr/src/WantedPersonsRegister.py
@@ -1,10 +1,10 @@
import gc
import json
import logging
-import requests
from datetime import datetime
-from pymongo.errors import PyMongoError
+import requests
+from pymongo.errors import PyMongoError
from .dataset import Dataset
@@ -59,10 +59,11 @@ def __save_dataset(self, json):
@Dataset.measure_execution_time
def __clear_collection(self):
- wanted_persons_col = self.db['WantedPersons']
- count_deleted_documents = wanted_persons_col.delete_many({})
- logging.warning('%s documents deleted. The wanted persons collection is empty.', str(
- count_deleted_documents.deleted_count))
+ if self.is_collection_exists('WantedPersons'):
+ wanted_persons_col = self.db['WantedPersons']
+ count_deleted_documents = wanted_persons_col.delete_many({})
+ logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The wanted persons '
+ f'collection is empty.')
@Dataset.measure_execution_time
def __create_service_json(self):
@@ -92,9 +93,9 @@ def __update_service_json(self):
@Dataset.measure_execution_time
def __update_metadata(self):
- collections_list = self.db.list_collection_names()
# update or create WantedPersonsRegisterServiceJson
- if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 2}, limit=1) != 0):
+ if (self.is_collection_exists('ServiceCollection')) and (
+ self.service_col.count_documents({'_id': 2}, limit=1) != 0):
self.__update_service_json()
logging.info('WantedPersonsRegisterServiceJson updated')
else:
@@ -103,10 +104,11 @@ def __update_metadata(self):
@Dataset.measure_execution_time
def __delete_collection_index(self):
- wanted_persons_col = self.db['WantedPersons']
- if 'full_text' in wanted_persons_col.index_information():
- wanted_persons_col.drop_index('full_text')
- logging.warning('WantedPersons Text index deleted')
+ if self.is_collection_exists('WantedPersons'):
+ wanted_persons_col = self.db['WantedPersons']
+ if 'full_text' in wanted_persons_col.index_information():
+ wanted_persons_col.drop_index('full_text')
+ logging.warning('WantedPersons Text index deleted')
@Dataset.measure_execution_time
def __create_collection_index(self):
@@ -128,10 +130,10 @@ def search_into_collection(self, query_string):
logging.warning('The wanted persons register: No data found')
final_result = 0
else:
- logging.warning('The wanted persons register: %s records found', str(result_count))
+ logging.warning(f'The wanted persons register: {result_count} records found')
final_result = wanted_persons_col.find({'$text': {'$search': query_string}},
- {'score': {'$meta': 'textScore'}})\
- .sort([('score', {'$meta': 'textScore'})])
+ {'score': {'$meta': 'textScore'}}) \
+ .sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
gc.collect()
return final_result
diff --git a/flaskr/src/dataset.py b/flaskr/src/dataset.py
index 13fa537..79f78aa 100644
--- a/flaskr/src/dataset.py
+++ b/flaskr/src/dataset.py
@@ -1,14 +1,44 @@
import json
import logging
-from pathlib import Path
-from functools import wraps
from datetime import datetime
+from functools import wraps
+from pathlib import Path
import pymongo
from pymongo.errors import ServerSelectionTimeoutError
class Dataset:
+ """Base parent class for all datasets
+
+ --------
+ Methods:
+ --------
+ get_dataset():
+ Get the link to the dataset. Return the link to the dataset's source file
+ save_dataset():
+ Save the dataset into the Database. Input parameter - the link to the dataset's source file.
+ clear_collection():
+ Purge the collection
+ __create_service_json():
+ Create and save a JSON with service information about a dataset
+ __update_service_json():
+ Update and save a JSON with service information about a dataset
+ update_metadata():
+        Call __create_service_json() if a dataset is first time saved. Or call __update_service_json() if a dataset is refreshed
+ delete_collection_index():
+ Drop a database full-text search index
+ create_collection_index():
+ Create a database full-text search index
+ search_into_collection():
+ Search, show and save search results
+ setup_dataset():
+ A sequence of class methods to setup a dataset
+ measure_execution_time():
+        A service function / a decorator to measure execution time
+ is_collection_exists():
+ Check if a collection exists. Input parameter - a collection name
+ """
def __init__(self):
self.logger = logging.getLogger(__name__)
@@ -46,37 +76,62 @@ def __init__(self):
logging.error(
f'{self.__class__.__name__}: Config.json is not found')
- def get_dataset(self):
+ def __get_dataset(self):
+ """Get the link to the dataset.
+ Return the link to the dataset's source file
+ """
pass
- def save_dataset(self):
+ def __save_dataset(self):
+ """Save the dataset into the Database.
+ Input parameter - the link to the dataset's source file.
+ """
pass
- def clear_collection(self):
+ def __clear_collection(self):
+ """Purge the collection
+ """
pass
def __create_service_json(self):
+ """Create and save a JSON with service information about a dataset
+ """
pass
def __update_service_json(self):
+ """Update and save a JSON with service information about a dataset
+ """
pass
- def update_metadata(self):
+ def __update_metadata(self):
+        """Call __create_service_json() if a dataset is first time saved. Or call __update_service_json() if a dataset is refreshed
+ """
pass
- def delete_collection_index(self):
+ def __delete_collection_index(self):
+ """Drop a database full-text search index
+ """
pass
- def create_collection_index(self):
+ def __create_collection_index(self):
+ """Create a database full-text search index
+ """
pass
- def search_into_collection(self):
+ def search_into_collection(self, query_string):
+ """Search, show and save search results
+ """
pass
def setup_dataset(self):
+ """A sequence of class methods to setup a dataset
+ """
pass
def measure_execution_time(func):
+        """A service function / a decorator to measure execution time
+ """
+
@wraps(func)
def log_time(*args, **kwargs):
start_time = datetime.now()
@@ -86,4 +141,14 @@ def log_time(*args, **kwargs):
end_time = datetime.now()
logging.info(
f'Total execution time {args[0].__class__.__name__}.{func.__name__}: {end_time - start_time}')
+
return log_time
+
+ def is_collection_exists(self, collection_name):
+ """Check if a collection exists.
+
+ :param collection_name: str/a collection name
+ :return: True - if a collection exists; False - if not
+ """
+ collections_list = self.db.list_collection_names()
+ return collection_name in collections_list
diff --git a/flaskr/templates/home.html b/flaskr/templates/home.html
index 004d5c3..36bbed7 100644
--- a/flaskr/templates/home.html
+++ b/flaskr/templates/home.html
@@ -4,11 +4,16 @@
SearchMyData-2.0
-
-
+
+
@@ -17,7 +22,8 @@
SearchMyData
-