-
Notifications
You must be signed in to change notification settings - Fork 6
Database implementation #92
base: main
Are you sure you want to change the base?
Changes from all commits
f7fc4a9
b5d9819
099e126
bce7644
7a76009
fc8a221
5e89889
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,15 +2,56 @@ | |
services/databases""" | ||
import requests # pylint: disable=import-error | ||
import logging | ||
import numpy as np # pylint: disable=import-error | ||
import firebase_admin | ||
from firebase_admin import credentials | ||
from firebase_admin import firestore | ||
import constants | ||
import env | ||
|
||
logging.basicConfig(level=logging.INFO) | ||
|
||
class Keywords: | ||
"""Class for storing a keyword and the articles that contain it. | ||
""" | ||
def __init__(self, keyword): | ||
self.keyword = keyword | ||
self.articles_that_contain_keyword = {} | ||
|
||
def to_dict(self): | ||
article_dict = { | ||
"keyword": self.number, | ||
"contain": self.articles_that_contain_keyword, | ||
} | ||
return article_dict | ||
|
||
@staticmethod | ||
def from_dict(src): | ||
self.number = src["number"] | ||
self.id = src["id"] | ||
self.content = src["keyword"] | ||
self.wordCount = src["frequency"] | ||
|
||
cred = credentials.ApplicationDefault() | ||
firebase_admin.initialize_app(cred, { | ||
'projectId': 'major-tom-285619', | ||
jaimehisao marked this conversation as resolved.
Show resolved
Hide resolved
|
||
}) | ||
|
||
db = firestore.client() | ||
|
||
articles_in_memory = {} | ||
keywords_in_memory = {} | ||
|
||
|
||
def get_documents_to_parse_db(): | ||
documents_ref = db.collection(u'documents') | ||
docs = documents_ref.stream() | ||
|
||
for doc in docs: | ||
jaimehisao marked this conversation as resolved.
Show resolved
Hide resolved
|
||
print(f'{doc.id} => {doc.to_dict()}') | ||
return docs | ||
|
||
|
||
def get_documents_to_parse(): | ||
# When database is integrated, this will go away | ||
document_list = [] | ||
|
@@ -61,6 +102,34 @@ def get_articles_that_match_keywords(keywords_list): | |
return matching_articles | ||
|
||
|
||
def get_articles_by_tfidf_value(keywords_list): | ||
jaimehisao marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
Returns a relevance value for every article matching each | ||
keyword in the list; the value is based on | ||
term frequency-inverse document frequency (tf-idf). | ||
Args: | ||
keywords_list (list): Keyword(s) to look for | ||
|
||
Returns: | ||
list: articles and value for such keyword(s) | ||
""" | ||
matching_articles = {} | ||
for keyword in keywords_list: | ||
articles_that_match_keyword = {} | ||
if keyword in keywords_in_memory: | ||
for article in keywords_in_memory[keyword]: | ||
# tfidf computation | ||
word_count = articles_in_memory[str(article["number"])]["wordCount"] | ||
term_density_in_article = article["frequency"]/word_count | ||
document_frequency = len(articles_in_memory)/len(keywords_in_memory[keyword]) | ||
inverse_doc_freq = np.log(document_frequency) | ||
weight = term_density_in_article * inverse_doc_freq | ||
|
||
articles_that_match_keyword[str(article["number"])] = {"weight": weight} | ||
matching_articles[keyword] = articles_that_match_keyword | ||
return matching_articles | ||
|
||
|
||
def save_keywords_in_memory(keywords, article): | ||
"""Saves the keywords from an article in memory | ||
|
||
|
@@ -83,3 +152,56 @@ def store_article(article_dict): | |
articles_in_memory[article_dict["id"]] = article_dict | ||
save_keywords_in_memory(get_keywords(article_dict["content"]), article_dict) | ||
logging.info('Article ' + article_dict["id"] + ' assigned keywords') | ||
|
||
|
||
def store_article_in_db(article_dict): | ||
db.collection(u'articles').document(article_dict["id"]).set(article_dict) | ||
save_keywords_in_db(get_keywords(article_dict["content"]), article_dict) | ||
logging.info('Article ' + article_dict["id"] + ' assigned keywords') | ||
|
||
|
||
def save_keywords_in_db(keywords, article): | ||
"""Saves the keywords from an article in the database | ||
|
||
Args: | ||
keywords (JSON): contains keywords | ||
article (Article): article object | ||
""" | ||
for keyword in keywords: | ||
frequency = article["content"].count(keyword) | ||
|
||
doc_ref = db.collection(u'keywords').where('keyword', '==', keyword) | ||
doc = doc_ref.get() | ||
|
||
if len(doc) != 0 and doc[0] is not None: | ||
from_db = doc[0].to_dict() | ||
print(from_db) | ||
from_db["matching_articles"][article["id"]] = frequency | ||
#print(from_db) | ||
jaimehisao marked this conversation as resolved.
Show resolved
Hide resolved
|
||
db.collection(u'keywords').document(doc[0].id).set(from_db) | ||
else: | ||
to_send = {"keyword": keyword, "matching_articles": {article["id"]: frequency}} | ||
db.collection(u'keywords').add(to_send) | ||
Comment on lines
+163
to
+184
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like you're trying to mirror the same structure that you had for the in-memory implementation, which is not necessarily the best implementation. For example, |
||
|
||
|
||
def get_articles_that_match_keywords_db(keywords_list): | ||
matching_articles = {} | ||
for keyword in keywords_list: | ||
articles_that_match_keyword = {} | ||
doc_ref = db.collection(u'keywords').where('keyword', '==', keyword) | ||
doc = doc_ref.get() | ||
if doc.exists(): | ||
doc_dict = doc.to_dict() | ||
for article in doc_dict[keyword]: | ||
articles_that_match_keyword[str(article["id"])] = {"weight": article["frequency"]} | ||
matching_articles[keyword] = articles_that_match_keyword | ||
return matching_articles | ||
|
||
|
||
def get_article_by_id_db(art_num): | ||
documents_ref = db.collection(u'articles').document(art_num) | ||
doc = documents_ref.get() | ||
if doc is not None: | ||
return doc.to_dict() | ||
else: | ||
return None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we should return some info for debugging purposes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something like your last PR where you added more error handling? |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,7 +38,7 @@ class Article: | |
def __init__(self, number, content): | ||
self.number = number | ||
self.content = content | ||
self.id = str(number) | ||
self.id = 'monterrey'+str(number) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this ID be generated by Firebase instead? |
||
|
||
def to_dict(self): | ||
article_dict = { | ||
|
@@ -49,6 +49,13 @@ def to_dict(self): | |
} | ||
return article_dict | ||
|
||
@staticmethod | ||
def from_dict(src): | ||
self.number = src["number"] | ||
self.id = src["id"] | ||
self.content = src["content"] | ||
self.wordCount = src["wordCount"] | ||
|
||
|
||
def identify_articles(pdf_text): | ||
"""Identifies articles and returns a list of Article objects. | ||
|
@@ -113,4 +120,6 @@ def parse(document_to_parse): | |
|
||
for article in articles: | ||
dictionary = article.to_dict() | ||
connector.store_article(dictionary) | ||
connector.store_article_in_db(dictionary) | ||
|
||
#parse_all_documents() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,5 @@ Flask | |
utils | ||
https://github.com/timClicks/slate/archive/master.zip | ||
requests | ||
pytest-mock | ||
numpy | ||
pytest-mock |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As @anniefu mentioned, you'll probably want to just have a
db_connector.py
that defines methods with the same names that you already use in the code, so that you can just change the import statement and not have to change any code.