Skip to content
This repository has been archived by the owner on May 15, 2023. It is now read-only.

Database implementation #92

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions services/parser-database/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from flask import jsonify # pylint: disable=import-error

from parser import parse_all_documents
from connector import get_articles_that_match_keywords
from connector import get_article_by_number
from connector import get_article_by_id_db
from connector import get_articles_that_match_keywords_db
Comment on lines +10 to +11
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As @anniefu mentioned, you'll probably want to just have a db_connector.py that defines methods with the same names that you already use in the code, so that you can just change the import statement and not have to change any code.


app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False
Expand All @@ -32,14 +32,14 @@ def get_keywords():
logging.error(error)
return error, 400
else:
return jsonify(get_articles_that_match_keywords(json_request['keywords']))
return jsonify(get_articles_that_match_keywords_db(json_request['keywords']))


@app.route('/articles/<id>', methods=['GET'])
def get_article_by_number_in_memory(id):
"""Returns the article that matches the ID value
accoring to the apiSpec.yaml file"""
article = get_article_by_number(str(id))
article = get_article_by_id_db(str(id))
if article is not None:
article = copy(article)
return jsonify(article)
Expand All @@ -50,5 +50,5 @@ def get_article_by_number_in_memory(id):


if __name__ == '__main__':
    # NOTE(review): parsing on startup was disabled in this change —
    # confirm the articles already exist in the database before deploying,
    # otherwise the lookup endpoints will have nothing to serve.
    #parse_all_documents()
    app.run(debug=True, host='0.0.0.0', port=os.getenv("PORT"))
122 changes: 122 additions & 0 deletions services/parser-database/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,56 @@
services/databases"""
import requests # pylint: disable=import-error
import logging
import numpy as np # pylint: disable=import-error
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import constants
import env

logging.basicConfig(level=logging.INFO)

class Keywords:
    """Stores a keyword together with the articles that contain it.

    Mirrors the document layout used in the keyword store:
    ``{"keyword": <str>, "contain": {<article id>: <frequency>}}``.
    """

    def __init__(self, keyword):
        self.keyword = keyword
        # Maps article id -> how often the keyword appears in that article.
        self.articles_that_contain_keyword = {}

    def to_dict(self):
        """Serializes this object for storage.

        Returns:
            dict: with "keyword" and "contain" entries.
        """
        # BUG FIX: previously read self.number, an attribute Keywords
        # never defines, which raised AttributeError on every call.
        return {
            "keyword": self.keyword,
            "contain": self.articles_that_contain_keyword,
        }

    @staticmethod
    def from_dict(src):
        """Rebuilds a Keywords object from a dict produced by to_dict().

        BUG FIX: the old body assigned to an undefined name ``self``
        (staticmethods receive no instance) and returned nothing.

        Args:
            src (dict): serialized form with "keyword" and "contain" keys.

        Returns:
            Keywords: the reconstructed object.
        """
        keyword = Keywords(src["keyword"])
        keyword.articles_that_contain_keyword = src.get("contain", {})
        return keyword

# Firebase Admin SDK bootstrap. Application Default Credentials are used,
# so a service-account credential must be available in the runtime
# environment (e.g. GOOGLE_APPLICATION_CREDENTIALS).
cred = credentials.ApplicationDefault()
firebase_admin.initialize_app(cred, {
    # NOTE(review): hard-coded GCP project id — consider reading it from
    # an environment variable so other deployments can reuse this module.
    'projectId': 'major-tom-285619',
})

# Firestore client shared by every *_db function in this module.
db = firestore.client()

# Legacy in-memory stores, still used by the non-_db functions below
# while the database migration is in progress.
articles_in_memory = {}
keywords_in_memory = {}


def get_documents_to_parse_db():
    """Fetches every document in the 'documents' collection.

    Returns:
        list: document snapshots, materialized so callers can iterate
        them (the raw ``stream()`` generator is single-use).
    """
    # BUG FIX: stream() yields a one-shot generator; the old code consumed
    # it in the debug loop and then returned the exhausted iterator, so
    # callers always saw zero documents.
    docs = list(db.collection(u'documents').stream())
    for doc in docs:
        # Lazy %-args: the message is only built when INFO is enabled.
        logging.info('%s => %s', doc.id, doc.to_dict())
    return docs


def get_documents_to_parse():
# When database is integrated, this will go away
document_list = []
Expand Down Expand Up @@ -61,6 +102,34 @@ def get_articles_that_match_keywords(keywords_list):
return matching_articles


def get_articles_by_tfidf_value(keywords_list):
    """Scores every article that contains each requested keyword.

    The score is the classic tf-idf: the keyword's density inside the
    article times the log of (total articles / articles containing the
    keyword). Keywords absent from the index map to an empty dict.

    Args:
        keywords_list (list): Keyword(s) to look for

    Returns:
        dict: keyword -> {article number (str): {"weight": tf-idf score}}
    """
    matching_articles = {}
    total_articles = len(articles_in_memory)
    for keyword in keywords_list:
        weights = {}
        for entry in keywords_in_memory.get(keyword, []):
            article_key = str(entry["number"])
            # Term density: keyword hits relative to the article's length.
            density = entry["frequency"] / articles_in_memory[article_key]["wordCount"]
            idf = np.log(total_articles / len(keywords_in_memory[keyword]))
            weights[article_key] = {"weight": density * idf}
        matching_articles[keyword] = weights
    return matching_articles


def save_keywords_in_memory(keywords, article):
"""Saves the keywords from an article in memory

Expand All @@ -83,3 +152,56 @@ def store_article(article_dict):
articles_in_memory[article_dict["id"]] = article_dict
save_keywords_in_memory(get_keywords(article_dict["content"]), article_dict)
logging.info('Article ' + article_dict["id"] + ' assigned keywords')


def store_article_in_db(article_dict):
    """Persists an article to Firestore and indexes its keywords.

    Args:
        article_dict (dict): article fields; must contain at least
            "id" (document id) and "content" (full article text).
    """
    db.collection(u'articles').document(article_dict["id"]).set(article_dict)
    save_keywords_in_db(get_keywords(article_dict["content"]), article_dict)
    # Lazy %-formatting avoids building the message when INFO is disabled.
    logging.info('Article %s assigned keywords', article_dict["id"])


def save_keywords_in_db(keywords, article):
    """Saves the keywords from an article in the database.

    For each keyword, either merges this article's frequency into the
    existing 'keywords' document or creates a fresh document with the
    shape ``{"keyword": <str>, "matching_articles": {<id>: <count>}}``.

    Args:
        keywords (JSON): contains keywords
        article (dict): article fields; must contain "id" and "content"
    """
    for keyword in keywords:
        frequency = article["content"].count(keyword)

        # The where() query matches at most one document per keyword;
        # Query.get() returns a list of snapshots.
        matches = db.collection(u'keywords').where('keyword', '==', keyword).get()

        if len(matches) != 0 and matches[0] is not None:
            # Merge this article into the existing keyword document.
            # (Leftover print() debug statements removed.)
            from_db = matches[0].to_dict()
            from_db["matching_articles"][article["id"]] = frequency
            db.collection(u'keywords').document(matches[0].id).set(from_db)
        else:
            to_send = {"keyword": keyword, "matching_articles": {article["id"]: frequency}}
            db.collection(u'keywords').add(to_send)
Comment on lines +163 to +184
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like you're trying to mirror the same structure that you had for the in-memory implementation, which is not necessarily the best implementation. For example, keywords could be a field of the article document that gets queried directly, which eliminates the need to store keywords as a separate collection. See this guide on how to perform different queries against Firestore, particularly the example for array membership.



def get_articles_that_match_keywords_db(keywords_list):
    """Looks each keyword up in the 'keywords' collection.

    Args:
        keywords_list (list): keywords to look for.

    Returns:
        dict: keyword -> {article id (str): {"weight": frequency}};
        keywords with no stored document map to an empty dict.
    """
    matching_articles = {}
    for keyword in keywords_list:
        articles_that_match_keyword = {}
        # BUG FIX: Query.get() returns a *list* of snapshots — the old code
        # called .exists()/.to_dict() on the list itself, which raises
        # AttributeError on every request.
        docs = db.collection(u'keywords').where('keyword', '==', keyword).get()
        if docs:
            doc_dict = docs[0].to_dict()
            # save_keywords_in_db stores frequencies as a dict under the
            # "matching_articles" key, not under the keyword itself.
            for article_id, frequency in doc_dict["matching_articles"].items():
                articles_that_match_keyword[str(article_id)] = {"weight": frequency}
        matching_articles[keyword] = articles_that_match_keyword
    return matching_articles


def get_article_by_id_db(art_num):
    """Fetches a single article document by its id.

    Args:
        art_num (str): article document id.

    Returns:
        dict or None: the article's fields, or None when no document
        with that id exists.
    """
    doc = db.collection(u'articles').document(art_num).get()
    # BUG FIX: DocumentReference.get() returns a snapshot even for missing
    # documents, so the old `is not None` check always passed; the
    # .exists flag is the documented way to detect a missing document.
    if doc.exists:
        return doc.to_dict()
    return None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should return some info for debugging purposes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like your last PR where you added more error handling?

13 changes: 11 additions & 2 deletions services/parser-database/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Article:
def __init__(self, number, content, city='monterrey'):
    """Creates an article.

    Args:
        number: article number within the regulation.
        content: full text of the article.
        city (str): prefix that keeps document ids unique across
            different cities' regulations; defaults to 'monterrey',
            matching the previously hard-coded behavior.
    """
    self.number = number
    self.content = content
    self.id = city + str(number)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this ID be generated by Firebase instead?


def to_dict(self):
article_dict = {
Expand All @@ -49,6 +49,13 @@ def to_dict(self):
}
return article_dict

@staticmethod
def from_dict(src):
    """Rebuilds an Article from a dict produced by to_dict().

    BUG FIX: the old body assigned to an undefined name ``self``
    (staticmethods receive no instance) and raised NameError.

    Args:
        src (dict): serialized article with "number", "id", "content"
            and "wordCount" keys.

    Returns:
        Article: the reconstructed article.
    """
    article = Article(src["number"], src["content"])
    # Preserve the stored id and word count instead of recomputing them.
    article.id = src["id"]
    article.wordCount = src["wordCount"]
    return article


def identify_articles(pdf_text):
"""Identifies articles and returns a list of Article objects.
Expand Down Expand Up @@ -113,4 +120,6 @@ def parse(document_to_parse):

for article in articles:
dictionary = article.to_dict()
connector.store_article(dictionary)
connector.store_article_in_db(dictionary)

#parse_all_documents()
3 changes: 2 additions & 1 deletion services/parser-database/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ Flask
utils
https://github.com/timClicks/slate/archive/master.zip
requests
pytest-mock
numpy
pytest-mock
62 changes: 62 additions & 0 deletions services/parser-database/tests/test_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,35 @@
}


# Keyword index fixture: keyword -> list of {"number", "frequency"} entries.
# Frequencies divide evenly into the word counts below, so the tf-idf
# weights come out as exact fractions once numpy.log is mocked to identity.
in_memory_value_mock_no_decimals = {
    "ciclista": [
        {"number": 5, "frequency": 3},
        {"number": 45, "frequency": 6},
        {"number": 99, "frequency": 9},
    ],
    "licencia": [
        {"number": 89, "frequency": 3},
        {"number": 45, "frequency": 3},
        {"number": 125, "frequency": 15},
    ],
}


# Article metadata keyed by article number (as a string).
articles_in_memory = {'5': {'wordCount': 32}, '45': {'wordCount': 40}, '89': {'wordCount': 16},
                      '99': {'wordCount': 50}, '125': {'wordCount': 200}}


# Same articles without a wordCount field, to exercise the KeyError path.
articles_in_memory_no_wordCount = {'5': {}, '45': {}, '89': {}, '99': {}, '125': {}}


def logn(num):
    """Identity stand-in for numpy.log.

    Substituting this for the real log keeps the expected tf-idf
    weights in the tests as simple, exact fractions.
    """
    return num


@mock.patch("connector.keywords_in_memory", in_memory_value_mock)
def test_get_articles_that_match_keywords_empty_result_one_keyword():
result_to_assert_1 = {"alcohol": {}}
Expand Down Expand Up @@ -71,6 +100,39 @@ def test_get_articles_that_match_keywords_non_empty_result_two_keywords():
assert result == result_to_assert_4


@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
@mock.patch("connector.articles_in_memory", articles_in_memory)
def test_get_articles_by_tfidf_value():
expected = {
"licencia": {"89": {"weight": .3125}, "45": {"weight": .125}, "125": {"weight": .125}},
"ciclista": {"5": {"weight": .15625}, "45": {"weight": .25}, "99": {"weight": .3}},
}
keywords = ["licencia", "ciclista"]
with mock.patch("numpy.log", side_effect=logn):
assert expected == connector.get_articles_by_tfidf_value(keywords)


@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
@mock.patch("connector.articles_in_memory", articles_in_memory)
def test_get_articles_by_tfidf_value_empty_result():
expected = {
"casco": {},
"luz": {},
}
keywords = ["casco", "luz"]
with mock.patch("numpy.log", side_effect=logn):
assert expected == connector.get_articles_by_tfidf_value(keywords)


@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
@mock.patch("connector.articles_in_memory", articles_in_memory_no_wordCount)
def test_get_articles_by_tfidf_value_missing_word_count():
keywords = ["licencia", "ciclista"]
with mock.patch("numpy.log", side_effect=logn):
with pytest.raises(KeyError):
connector.get_articles_by_tfidf_value(keywords)


def test_get_documents():
    """The in-memory parser exposes exactly the Monterrey document."""
    documents = connector.get_documents_to_parse()
    assert documents == [constants.mty_document]

Expand Down