Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

The codebase synced #5

Merged
merged 2 commits into from
Nov 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,4 @@ dmypy.json
# Cython debug symbols
cython_debug/

.idea/
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions .idea/git_toolbox_prj.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
*The application is being migrated to the Flask framework. Not all functionality is available yet.*
# Welcome to the SearchMyData-2.0 App! The web version of [the SearchMyData App](https://github.com/AMProduction/SearchMyData)
# Welcome to the SearchMyData-2.0 App! The web version of [the SearchMyData App](https://github.com/AMProduction/SearchMyData/wiki)
## Description
The app lets you search the datasets of the Ukrainian government's [Open data portal](https://data.gov.ua/en/).
At this moment (October 2021) integrated [Information on missing citizens](https://data.gov.ua/en/dataset/470196d3-4e7a-46b0-8c0c-883b74ac65f0), [Information about people hiding from the authorities](https://data.gov.ua/en/dataset/7c51c4a0-104b-4540-a166-e9fc58485c1b), [Unified register of debtors](https://data.gov.ua/dataset/506734bf-2480-448c-a2b4-90b6d06df11e) and [Unified State Register of Legal Entities, Individual Entrepreneurs and Public Associations](https://data.gov.ua/dataset/1c7f3815-3259-45e0-bdf1-64dca07ddc10).
At this moment (November 2021), the following datasets are integrated: [Information on missing citizens](https://data.gov.ua/en/dataset/470196d3-4e7a-46b0-8c0c-883b74ac65f0),
[Information about people hiding from the authorities](https://data.gov.ua/en/dataset/7c51c4a0-104b-4540-a166-e9fc58485c1b),
[Unified register of debtors](https://data.gov.ua/dataset/506734bf-2480-448c-a2b4-90b6d06df11e),
[Unified State Register of Legal Entities, Individual Entrepreneurs and Public Associations](https://data.gov.ua/dataset/1c7f3815-3259-45e0-bdf1-64dca07ddc10)
and [Integrated Unified State Register of Lustrated Persons](https://data.gov.ua/dataset/8faa71c1-3a54-45e8-8f6e-06c92b1ff8bc).
## Hardware requirements
* 16 GB+ RAM
* SSD
## See additional info into [the SearchMyData-2.0 App wiki](https://github.com/AMProduction/SearchMyData/wiki)
## See additional info in [the SearchMyData-2.0 App wiki](https://github.com/AMProduction/SearchMyData-2.0/wiki)
2 changes: 2 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
### v2.0. First release. XX/XX/2021
* ???
6 changes: 5 additions & 1 deletion flaskr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@ def get_search_results():
from .src.LegalEntitiesRegister import LegalEntitiesRegister
from .src.MissingPersonsRegister import MissingPersonsRegister
from .src.WantedPersonsRegister import WantedPersonsRegister
from .src.LustratedPersonsRegister import LustratedPersonsRegister
# create instances
missing_persons = MissingPersonsRegister()
wanted_persons = WantedPersonsRegister()
debtors = DebtorsRegister()
legal_entities = LegalEntitiesRegister()
entrepreneurs = EntrepreneursRegister()
lustrated = LustratedPersonsRegister()
if request.method == 'POST':
search_string = request.form['search']
# call search methods
Expand All @@ -38,8 +40,10 @@ def get_search_results():
result_debtors = debtors.search_into_collection(search_string)
result_legal_entities = legal_entities.search_into_collection(search_string)
result_entrepreneurs = entrepreneurs.search_into_collection(search_string)
result_lustrated = lustrated.search_into_collection(search_string)
return render_template('result.html', resultMissingPersons=result_missing_persons,
resultWantedPersons=result_wanted_persons, resultDebtors=result_debtors,
resultLegalEntities=result_legal_entities, resultEntrepreneurs=result_entrepreneurs)
resultLegalEntities=result_legal_entities, resultEntrepreneurs=result_entrepreneurs,
resultLustrated=result_lustrated)

return app
39 changes: 20 additions & 19 deletions flaskr/src/DebtorsRegister.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import gc
import json
import logging
import mmap
import os
import shutil
import zipfile
import mmap
import requests
from datetime import datetime
from io import BytesIO
from pymongo.errors import PyMongoError


import requests
from dask import dataframe as dd
from pymongo.errors import PyMongoError

from .dataset import Dataset

Expand Down Expand Up @@ -71,7 +70,7 @@ def __save_dataset(self, zip_url):
# convert CSV to JSON using Dask
debtors_csv.to_json('debtorsJson')
for file in os.listdir('debtorsJson'):
file_object = open('debtorsJson/'+file, mode='r')
file_object = open('debtorsJson/' + file, mode='r')
# map the entire file into memory, size 0 means whole file, normally much faster than buffered i/o
mm = mmap.mmap(file_object.fileno(), 0, access=mmap.ACCESS_READ)
# iterate over the block, until next newline
Expand All @@ -93,10 +92,11 @@ def __save_dataset(self, zip_url):

@Dataset.measure_execution_time
def __clear_collection(self):
debtors_col = self.db['Debtors']
count_deleted_documents = debtors_col.delete_many({})
logging.warning('%s documents deleted. The wanted persons collection is empty.', str(
count_deleted_documents.deleted_count))
if self.is_collection_exists('Debtors'):
debtors_col = self.db['Debtors']
count_deleted_documents = debtors_col.delete_many({})
logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The wanted persons '
f'collection is empty.')

@Dataset.measure_execution_time
def __create_service_json(self):
Expand Down Expand Up @@ -126,9 +126,9 @@ def __update_service_json(self):

@Dataset.measure_execution_time
def __update_metadata(self):
collections_list = self.db.list_collection_names()
# update or create DebtorsRegisterServiceJson
if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 3}, limit=1) != 0):
if (self.is_collection_exists('ServiceCollection')) and (
self.service_col.count_documents({'_id': 3}, limit=1) != 0):
self.__update_service_json()
logging.info('DebtorsRegisterServiceJson updated')
else:
Expand All @@ -137,10 +137,11 @@ def __update_metadata(self):

@Dataset.measure_execution_time
def __delete_collection_index(self):
debtors_col = self.db['Debtors']
if 'full_text' in debtors_col.index_information():
debtors_col.drop_index('full_text')
logging.warning('Debtors Text index deleted')
if self.is_collection_exists('Debtors'):
debtors_col = self.db['Debtors']
if 'full_text' in debtors_col.index_information():
debtors_col.drop_index('full_text')
logging.warning('Debtors Text index deleted')

@Dataset.measure_execution_time
def __create_collection_index(self):
Expand All @@ -153,16 +154,16 @@ def search_into_collection(self, query_string):
debtors_col = self.db['Debtors']
final_result = 0
try:
resultCount = debtors_col.count_documents({'$text': {'$search': query_string}})
result_count = debtors_col.count_documents({'$text': {'$search': query_string}})
except PyMongoError:
logging.error('Error during search into Debtors Register')
else:
if resultCount == 0:
if result_count == 0:
logging.warning('The debtors register: No data found')
final_result = 0
else:
logging.warning('The debtors register: %s records found', str(resultCount))
final_result = debtors_col.find({'$text': {'$search': query_string}}, {'score': {'$meta': 'textScore'}})\
logging.warning(f'The debtors register: {result_count} records found')
final_result = debtors_col.find({'$text': {'$search': query_string}}, {'score': {'$meta': 'textScore'}}) \
.sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
gc.collect()
return final_result
Expand Down
34 changes: 18 additions & 16 deletions flaskr/src/EntrepreneursRegister.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import gc
import logging
from datetime import datetime
from pymongo.errors import PyMongoError

from pymongo.errors import PyMongoError

from .dataset import Dataset

Expand All @@ -21,10 +21,11 @@ def save_dataset(self):

@Dataset.measure_execution_time
def clear_collection(self):
entrepreneurs_col = self.db['Entrepreneurs']
count_deleted_documents = entrepreneurs_col.delete_many({})
logging.warning('%s documents deleted. The entrepreneurs collection is empty.', str(
count_deleted_documents.deleted_count))
if self.is_collection_exists('Entrepreneurs'):
entrepreneurs_col = self.db['Entrepreneurs']
count_deleted_documents = entrepreneurs_col.delete_many({})
logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The entrepreneurs collection '
f'is empty.')

@Dataset.measure_execution_time
def __create_service_json(self):
Expand Down Expand Up @@ -54,9 +55,9 @@ def __update_service_json(self):

@Dataset.measure_execution_time
def update_metadata(self):
collections_list = self.db.list_collection_names()
# update or create EntrepreneursRegisterServiceJson
if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 5}, limit=1) != 0):
if (self.is_collection_exists('ServiceCollection')) and (
self.service_col.count_documents({'_id': 5}, limit=1) != 0):
self.__update_service_json()
logging.info('EntrepreneursRegisterServiceJson updated')
else:
Expand All @@ -65,10 +66,11 @@ def update_metadata(self):

@Dataset.measure_execution_time
def delete_collection_index(self):
entrepreneurs_col = self.db['Entrepreneurs']
if 'full_text' in entrepreneurs_col.index_information():
entrepreneurs_col.drop_index('full_text')
logging.warning('Entrepreneurs Text index deleted')
if self.is_collection_exists('Entrepreneurs'):
entrepreneurs_col = self.db['Entrepreneurs']
if 'full_text' in entrepreneurs_col.index_information():
entrepreneurs_col.drop_index('full_text')
logging.warning('Entrepreneurs Text index deleted')

@Dataset.measure_execution_time
def create_collection_index(self):
Expand All @@ -81,17 +83,17 @@ def search_into_collection(self, query_string):
entrepreneurs_col = self.db['Entrepreneurs']
final_result = 0
try:
resultCount = entrepreneurs_col.count_documents({'$text': {'$search': query_string}})
result_count = entrepreneurs_col.count_documents({'$text': {'$search': query_string}})
except PyMongoError:
logging.error('Error during search into Entrepreneurs Register')
else:
if resultCount == 0:
if result_count == 0:
logging.warning('The Entrepreneurs register: No data found')
final_result = 0
else:
logging.warning('The Entrepreneurs register: %s records found', str(resultCount))
logging.warning(f'The Entrepreneurs register: {result_count} records found')
final_result = entrepreneurs_col.find({'$text': {'$search': query_string}},
{'score': {'$meta': 'textScore'}})\
.sort([('score',{'$meta': 'textScore'})]).allow_disk_use(True)
{'score': {'$meta': 'textScore'}}) \
.sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
gc.collect()
return final_result
38 changes: 20 additions & 18 deletions flaskr/src/LegalEntitiesRegister.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import shutil
import xml.etree.ElementTree as ET
import zipfile
import requests
from datetime import datetime
from io import BytesIO
from pymongo.errors import PyMongoError

import requests
from pymongo.errors import PyMongoError

from .dataset import Dataset

Expand Down Expand Up @@ -65,7 +65,7 @@ def save_dataset(self, zip_url):
logging.warning('File in ZIP: ' + str(xml_file))
# unzip all files
entrepreneurs_zip.extractall('Temp')
for xml_file in os.listdir('Temp/'+root_folder_name):
for xml_file in os.listdir('Temp/' + root_folder_name):
if xml_file.find('_UO_') != -1:
# read the legal Entities Xml file
path_to_file = 'Temp/' + root_folder_name + xml_file
Expand All @@ -81,14 +81,14 @@ def save_dataset(self, zip_url):
kved = record.find('KVED').text
boss = record.find('BOSS').text
beneficiaries_dict = {}
beneficiaryNumber = 1
beneficiary_number = 1
for beneficiaries in record.iter('BENEFICIARIES'):
if beneficiaries.find('BENEFICIARY') is not None:
for beneficiary in beneficiaries.iter('BENEFICIARY'):
beneficiary_to_dict = beneficiary.text
key = 'beneficiary' + str(beneficiaryNumber)
key = 'beneficiary' + str(beneficiary_number)
beneficiaries_dict[key] = beneficiary_to_dict
beneficiaryNumber += 1
beneficiary_number += 1
founders_dict = {}
founders_number = 1
for founders in record.iter('FOUNDERS'):
Expand Down Expand Up @@ -147,10 +147,11 @@ def save_dataset(self, zip_url):

@Dataset.measure_execution_time
def clear_collection(self):
legal_entities_col = self.db['LegalEntities']
count_deleted_documents = legal_entities_col.delete_many({})
logging.warning('%s documents deleted. The legal entities collection is empty.', str(
count_deleted_documents.deleted_count))
if self.is_collection_exists('LegalEntities'):
legal_entities_col = self.db['LegalEntities']
count_deleted_documents = legal_entities_col.delete_many({})
logging.warning(f'{count_deleted_documents.deleted_count} documents deleted. The legal entities '
f'collection is empty.')

@Dataset.measure_execution_time
def __create_service_json(self):
Expand Down Expand Up @@ -180,9 +181,9 @@ def __update_service_json(self):

@Dataset.measure_execution_time
def update_metadata(self):
collections_list = self.db.list_collection_names()
# update or create LegalEntitiesRegisterServiceJson
if ('ServiceCollection' in collections_list) and (self.service_col.count_documents({'_id': 4}, limit=1) != 0):
if (self.is_collection_exists('ServiceCollection')) and (
self.service_col.count_documents({'_id': 4}, limit=1) != 0):
self.__update_service_json()
logging.info('LegalEntitiesRegisterServiceJson updated')
else:
Expand All @@ -191,10 +192,11 @@ def update_metadata(self):

@Dataset.measure_execution_time
def delete_collection_index(self):
legal_entities_col = self.db['LegalEntities']
if 'full_text' in legal_entities_col.index_information():
legal_entities_col.drop_index('full_text')
logging.warning('LegalEntities Text index deleted')
if self.is_collection_exists('LegalEntities'):
legal_entities_col = self.db['LegalEntities']
if 'full_text' in legal_entities_col.index_information():
legal_entities_col.drop_index('full_text')
logging.warning('LegalEntities Text index deleted')

@Dataset.measure_execution_time
def create_collection_index(self):
Expand All @@ -216,9 +218,9 @@ def search_into_collection(self, query_string):
logging.warning('The legal entities register: No data found')
final_result = 0
else:
logging.warning('The legal entities register: %s records found', str(result_count))
logging.warning(f'The legal entities register: {result_count} records found')
final_result = legal_entities_col.find({'$text': {'$search': query_string}},
{'score': {'$meta': 'textScore'}})\
{'score': {'$meta': 'textScore'}}) \
.sort([('score', {'$meta': 'textScore'})]).allow_disk_use(True)
gc.collect()
return final_result
Loading