Skip to content
This repository has been archived by the owner on May 15, 2023. It is now read-only.

Database implementation #92

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions services/parser-database/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from flask import jsonify # pylint: disable=import-error

from parser import parse_all_documents
from connector import get_articles_that_match_keywords
from connector import get_article_by_number
from connector import get_article_by_id_db
from connector import get_articles_that_match_keywords_db
Comment on lines +10 to +11
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As @anniefu mentioned, you'll probably want to just have a db_connector.py that defines methods with the same names that you already use in the code, so that you can just change the import statement and not have to change any code.


app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False
Expand All @@ -32,14 +32,14 @@ def get_keywords():
logging.error(error)
return error, 400
else:
return jsonify(get_articles_that_match_keywords(json_request['keywords']))
return jsonify(get_articles_that_match_keywords_db(json_request['keywords']))


@app.route('/articles/<id>', methods=['GET'])
def get_article_by_number_in_memory(id):
"""Returns the article that matches the ID value
accoring to the apiSpec.yaml file"""
article = get_article_by_number(str(id))
article = get_article_by_id_db(str(id))
if article is not None:
article = copy(article)
return jsonify(article)
Expand All @@ -50,5 +50,5 @@ def get_article_by_number_in_memory(id):


if __name__ == '__main__':
    # NOTE(review): parsing on startup was disabled in this change —
    # confirm the articles already exist in the database before deploying,
    # otherwise the lookup endpoints will have nothing to serve.
    #parse_all_documents()
    app.run(debug=True, host='0.0.0.0', port=os.getenv("PORT"))
122 changes: 122 additions & 0 deletions services/parser-database/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,56 @@
services/databases"""
import requests # pylint: disable=import-error
import logging
import numpy as np # pylint: disable=import-error
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import constants
import env

logging.basicConfig(level=logging.INFO)

class Keywords:
    """Stores a keyword together with the articles that contain it.

    Mirrors the document layout used in the keyword store:
    ``{"keyword": <str>, "contain": {<article id>: <frequency>}}``.
    """

    def __init__(self, keyword):
        self.keyword = keyword
        # Maps article id -> how often the keyword appears in that article.
        self.articles_that_contain_keyword = {}

    def to_dict(self):
        """Serializes this object for storage.

        Returns:
            dict: with "keyword" and "contain" entries.
        """
        # BUG FIX: previously read self.number, an attribute Keywords
        # never defines, which raised AttributeError on every call.
        return {
            "keyword": self.keyword,
            "contain": self.articles_that_contain_keyword,
        }

    @staticmethod
    def from_dict(src):
        """Rebuilds a Keywords object from a dict produced by to_dict().

        BUG FIX: the old body assigned to an undefined name ``self``
        (staticmethods receive no instance) and returned nothing.

        Args:
            src (dict): serialized form with "keyword" and "contain" keys.

        Returns:
            Keywords: the reconstructed object.
        """
        keyword = Keywords(src["keyword"])
        keyword.articles_that_contain_keyword = src.get("contain", {})
        return keyword

# Firebase Admin SDK bootstrap. Application Default Credentials are used,
# so a service-account credential must be available in the runtime
# environment (e.g. GOOGLE_APPLICATION_CREDENTIALS).
cred = credentials.ApplicationDefault()
firebase_admin.initialize_app(cred, {
    # NOTE(review): hard-coded GCP project id — consider reading it from
    # an environment variable so other deployments can reuse this module.
    'projectId': 'major-tom-285619',
})

# Firestore client shared by every *_db function in this module.
db = firestore.client()

# Legacy in-memory stores, still used by the non-_db functions below
# while the database migration is in progress.
articles_in_memory = {}
keywords_in_memory = {}


def get_documents_to_parse_db():
    """Fetches every document in the 'documents' collection.

    Returns:
        list: document snapshots, materialized so callers can iterate
        them (the raw ``stream()`` generator is single-use).
    """
    # BUG FIX: stream() yields a one-shot generator; the old code consumed
    # it in the debug loop and then returned the exhausted iterator, so
    # callers always saw zero documents.
    docs = list(db.collection(u'documents').stream())
    for doc in docs:
        # Lazy %-args: the message is only built when INFO is enabled.
        logging.info('%s => %s', doc.id, doc.to_dict())
    return docs


def get_documents_to_parse():
# When database is integrated, this will go away
document_list = []
Expand Down Expand Up @@ -61,6 +102,34 @@ def get_articles_that_match_keywords(keywords_list):
return matching_articles


def get_articles_by_tfidf_value(keywords_list):
    """Scores every article that contains each requested keyword.

    The score is the classic tf-idf: the keyword's density inside the
    article times the log of (total articles / articles containing the
    keyword). Keywords absent from the index map to an empty dict.

    Args:
        keywords_list (list): Keyword(s) to look for

    Returns:
        dict: keyword -> {article number (str): {"weight": tf-idf score}}
    """
    matching_articles = {}
    total_articles = len(articles_in_memory)
    for keyword in keywords_list:
        weights = {}
        for entry in keywords_in_memory.get(keyword, []):
            article_key = str(entry["number"])
            # Term density: keyword hits relative to the article's length.
            density = entry["frequency"] / articles_in_memory[article_key]["wordCount"]
            idf = np.log(total_articles / len(keywords_in_memory[keyword]))
            weights[article_key] = {"weight": density * idf}
        matching_articles[keyword] = weights
    return matching_articles


def save_keywords_in_memory(keywords, article):
"""Saves the keywords from an article in memory

Expand All @@ -83,3 +152,56 @@ def store_article(article_dict):
articles_in_memory[article_dict["id"]] = article_dict
save_keywords_in_memory(get_keywords(article_dict["content"]), article_dict)
logging.info('Article ' + article_dict["id"] + ' assigned keywords')


def store_article_in_db(article_dict):
    """Persists an article to Firestore and indexes its keywords.

    Args:
        article_dict (dict): article fields; must contain at least
            "id" (document id) and "content" (full article text).
    """
    db.collection(u'articles').document(article_dict["id"]).set(article_dict)
    save_keywords_in_db(get_keywords(article_dict["content"]), article_dict)
    # Lazy %-formatting avoids building the message when INFO is disabled.
    logging.info('Article %s assigned keywords', article_dict["id"])


def save_keywords_in_db(keywords, article):
    """Saves the keywords from an article in the database.

    For each keyword, either merges this article's frequency into the
    existing 'keywords' document or creates a fresh document with the
    shape ``{"keyword": <str>, "matching_articles": {<id>: <count>}}``.

    Args:
        keywords (JSON): contains keywords
        article (dict): article fields; must contain "id" and "content"
    """
    for keyword in keywords:
        frequency = article["content"].count(keyword)

        # The where() query matches at most one document per keyword;
        # Query.get() returns a list of snapshots.
        matches = db.collection(u'keywords').where('keyword', '==', keyword).get()

        if len(matches) != 0 and matches[0] is not None:
            # Merge this article into the existing keyword document.
            # (Leftover print() debug statements removed.)
            from_db = matches[0].to_dict()
            from_db["matching_articles"][article["id"]] = frequency
            db.collection(u'keywords').document(matches[0].id).set(from_db)
        else:
            to_send = {"keyword": keyword, "matching_articles": {article["id"]: frequency}}
            db.collection(u'keywords').add(to_send)
Comment on lines +163 to +184
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like you're trying to mirror the same structure that you had for the in-memory implementation, which is not necessarily the best implementation. For example, keywords could be a field of the article document that gets queried directly, which eliminates the need to store keywords as a separate collection. See this guide on how to perform different queries against Firestore, particularly the example for array membership.



def get_articles_that_match_keywords_db(keywords_list):
    """Looks each keyword up in the 'keywords' collection.

    Args:
        keywords_list (list): keywords to look for.

    Returns:
        dict: keyword -> {article id (str): {"weight": frequency}};
        keywords with no stored document map to an empty dict.
    """
    matching_articles = {}
    for keyword in keywords_list:
        articles_that_match_keyword = {}
        # BUG FIX: Query.get() returns a *list* of snapshots — the old code
        # called .exists()/.to_dict() on the list itself, which raises
        # AttributeError on every request.
        docs = db.collection(u'keywords').where('keyword', '==', keyword).get()
        if docs:
            doc_dict = docs[0].to_dict()
            # save_keywords_in_db stores frequencies as a dict under the
            # "matching_articles" key, not under the keyword itself.
            for article_id, frequency in doc_dict["matching_articles"].items():
                articles_that_match_keyword[str(article_id)] = {"weight": frequency}
        matching_articles[keyword] = articles_that_match_keyword
    return matching_articles


def get_article_by_id_db(art_num):
    """Fetches a single article document by its id.

    Args:
        art_num (str): article document id.

    Returns:
        dict or None: the article's fields, or None when no document
        with that id exists.
    """
    doc = db.collection(u'articles').document(art_num).get()
    # BUG FIX: DocumentReference.get() returns a snapshot even for missing
    # documents, so the old `is not None` check always passed; the
    # .exists flag is the documented way to detect a missing document.
    if doc.exists:
        return doc.to_dict()
    return None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should return some info for debugging purposes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like your last PR where you added more error handling?

13 changes: 11 additions & 2 deletions services/parser-database/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Article:
def __init__(self, number, content, city='monterrey'):
    """Creates an article.

    Args:
        number: article number within the regulation.
        content: full text of the article.
        city (str): prefix that keeps document ids unique across
            different cities' regulations; defaults to 'monterrey',
            matching the previously hard-coded behavior.
    """
    self.number = number
    self.content = content
    self.id = city + str(number)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this ID be generated by Firebase instead?


def to_dict(self):
article_dict = {
Expand All @@ -49,6 +49,13 @@ def to_dict(self):
}
return article_dict

@staticmethod
def from_dict(src):
    """Rebuilds an Article from a dict produced by to_dict().

    BUG FIX: the old body assigned to an undefined name ``self``
    (staticmethods receive no instance) and raised NameError.

    Args:
        src (dict): serialized article with "number", "id", "content"
            and "wordCount" keys.

    Returns:
        Article: the reconstructed article.
    """
    article = Article(src["number"], src["content"])
    # Preserve the stored id and word count instead of recomputing them.
    article.id = src["id"]
    article.wordCount = src["wordCount"]
    return article


def identify_articles(pdf_text):
"""Identifies articles and returns a list of Article objects.
Expand Down Expand Up @@ -113,4 +120,6 @@ def parse(document_to_parse):

for article in articles:
dictionary = article.to_dict()
connector.store_article(dictionary)
connector.store_article_in_db(dictionary)

#parse_all_documents()
3 changes: 2 additions & 1 deletion services/parser-database/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ Flask
utils
https://github.com/timClicks/slate/archive/master.zip
requests
pytest-mock
numpy
pytest-mock
62 changes: 62 additions & 0 deletions services/parser-database/tests/test_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,35 @@
}


# Keyword index fixture: keyword -> list of {"number", "frequency"} entries.
# Frequencies divide evenly into the word counts below, so the tf-idf
# weights come out as exact fractions once numpy.log is mocked to identity.
in_memory_value_mock_no_decimals = {
    "ciclista": [
        {"number": 5, "frequency": 3},
        {"number": 45, "frequency": 6},
        {"number": 99, "frequency": 9},
    ],
    "licencia": [
        {"number": 89, "frequency": 3},
        {"number": 45, "frequency": 3},
        {"number": 125, "frequency": 15},
    ],
}


# Article metadata keyed by article number (as a string).
articles_in_memory = {'5': {'wordCount': 32}, '45': {'wordCount': 40}, '89': {'wordCount': 16},
                      '99': {'wordCount': 50}, '125': {'wordCount': 200}}


# Same articles without a wordCount field, to exercise the KeyError path.
articles_in_memory_no_wordCount = {'5': {}, '45': {}, '89': {}, '99': {}, '125': {}}


def logn(num):
    """Identity stand-in for numpy.log.

    Substituting this for the real log keeps the expected tf-idf
    weights in the tests as simple, exact fractions.
    """
    return num


@mock.patch("connector.keywords_in_memory", in_memory_value_mock)
def test_get_articles_that_match_keywords_empty_result_one_keyword():
result_to_assert_1 = {"alcohol": {}}
Expand Down Expand Up @@ -71,6 +100,39 @@ def test_get_articles_that_match_keywords_non_empty_result_two_keywords():
assert result == result_to_assert_4


@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
@mock.patch("connector.articles_in_memory", articles_in_memory)
def test_get_articles_by_tfidf_value():
expected = {
"licencia": {"89": {"weight": .3125}, "45": {"weight": .125}, "125": {"weight": .125}},
"ciclista": {"5": {"weight": .15625}, "45": {"weight": .25}, "99": {"weight": .3}},
}
keywords = ["licencia", "ciclista"]
with mock.patch("numpy.log", side_effect=logn):
assert expected == connector.get_articles_by_tfidf_value(keywords)


@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
@mock.patch("connector.articles_in_memory", articles_in_memory)
def test_get_articles_by_tfidf_value_empty_result():
expected = {
"casco": {},
"luz": {},
}
keywords = ["casco", "luz"]
with mock.patch("numpy.log", side_effect=logn):
assert expected == connector.get_articles_by_tfidf_value(keywords)


@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
@mock.patch("connector.articles_in_memory", articles_in_memory_no_wordCount)
def test_get_articles_by_tfidf_value_missing_word_count():
keywords = ["licencia", "ciclista"]
with mock.patch("numpy.log", side_effect=logn):
with pytest.raises(KeyError):
connector.get_articles_by_tfidf_value(keywords)


def test_get_documents():
    """The in-memory parser exposes exactly the Monterrey document."""
    documents = connector.get_documents_to_parse()
    assert documents == [constants.mty_document]

Expand Down