From ee2b1d6058bf15a84fa3bcd4b28a9d9e5a3961a4 Mon Sep 17 00:00:00 2001 From: gurdeep330 Date: Fri, 7 Jun 2024 14:04:08 +0200 Subject: [PATCH] fix: snippet to update paper ids, and snippet to fix author name --- .../literature_fetch_recommendation_api.py | 16 ++++++++- app/code/utils.py | 34 ++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/app/code/literature_fetch_recommendation_api.py b/app/code/literature_fetch_recommendation_api.py index 02c33ef4..d9b028a0 100755 --- a/app/code/literature_fetch_recommendation_api.py +++ b/app/code/literature_fetch_recommendation_api.py @@ -115,6 +115,11 @@ def create_template(template, if len(paper_obj.authors) > 0: continue for author in paper_data['authors']: + # if the author id is None, set it to the name + if author['authorId'] is None: + author['authorId'] = author['name'] + print (f'Author ID is None for {author["name"]}. Setting it to the name.') + # Add the author to the article paper_obj.authors.append( Author(author['authorId'], author_name=author['name'])) @@ -132,6 +137,11 @@ def create_template(template, # rec_paper_obj.add_paper_details(rec_paper_data) ## for author in rec_paper_data['authors']: + # if the author id is None, set it to the name + if author['authorId'] is None: + author['authorId'] = author['name'] + print (f'Author ID is None for {author["name"]}. Setting it to the name.') + # Add the author to the article rec_paper_obj.authors.append( Author(author['authorId'], author_name=author['name'])) @@ -180,6 +190,11 @@ def create_template(template, paper_obj = Article(paper_data['paperId']) utils.add_paper_details(paper_obj, paper_data) for author in paper_data['authors']: + # if the author id is None, set it to the name + if author['authorId'] is None: + author['authorId'] = author['name'] + print (f'Author ID is None for {author["name"]}. Setting it to the name.') + # Add the author to the article paper_obj.authors.append( Author(author['authorId'], author_name=author['name'])) @@ -188,7 +203,6 @@ def create_template(template, # Get the metrics over time df = utils.metrics_over_time_js(topic_obj.paper_ids['recommended']) authors_ids = topic_obj.get_all_authors_ids() # Get all the authors of the articles - # print (authors_ids) author_details = utils.get_author_details(authors_ids) # Get the details of the authors for article_type in topic_obj.paper_ids: for article_id, article_obj in topic_obj.paper_ids[article_type].items(): diff --git a/app/code/utils.py b/app/code/utils.py index 6add57a4..b5bc586a 100755 --- a/app/code/utils.py +++ b/app/code/utils.py @@ -5,6 +5,7 @@ ''' import sys +import re import matplotlib.pyplot as plt import pandas as pd import yaml @@ -47,6 +48,21 @@ def update_paper_details(topic_obj): all_paper_ids += list(topic_obj.paper_ids['negative'].keys()) all_paper_ids = list(set(all_paper_ids)) all_paper_data = get_paper_details(all_paper_ids) + # Check if the paper id matches the paper data + # If not, change the paper id to the new paper id + for paper_id, paper_data in zip(all_paper_ids, all_paper_data): + if paper_id == paper_data['paperId']: + continue + print (f'Paper ID {paper_id} does not match {paper_data["paperId"]}.\ + Changing the paper ID.') + if paper_id in topic_obj.paper_ids['positive']: + # change the paper id in the positive articles + topic_obj.paper_ids['positive'][paper_data['paperId']] = \ + topic_obj.paper_ids['positive'].pop(paper_id) + elif paper_id in topic_obj.paper_ids['negative']: + # change the paper id in the negative articles + topic_obj.paper_ids['negative'][paper_data['paperId']] = \ + topic_obj.paper_ids['negative'].pop(paper_id) return all_paper_data def add_paper_details(article_obj, article_data): @@ -81,6 +97,8 @@ def update_h_index(article_obj, dic): author.h_index = row['hIndex'] author.name = row['name'] author.citation_count = row['citationCount'] + if row['hIndex'] is None: + continue authors_h_index_list.append(row['hIndex']) if len(authors_h_index_list) == 0: authors_avg_h_index = 0 @@ -215,7 +233,7 @@ def get_paper_details(paper_ids, fields=FIELDS): status_code = search_response.status_code return search_response.json() -def get_author_details(authors_ids): +def get_author_details(all_authors_ids): """ Get the author details @@ -225,6 +243,19 @@ def get_author_details(authors_ids): Returns: authors_details (list): list of authors details """ + # Some authors have no ids assigned, and in that case their ID is their name + # So here we exclude such authors and already prepare their output + author_details_wo_id = [] + authors_ids = [] + for author_id in all_authors_ids: + # check if author id contains only alphabets + if re.fullmatch(r'[A-Za-z ]+', author_id): + author_details_wo_id.append({'authorId': author_id, + 'hIndex': None, + 'name': author_id, + 'citationCount': None}) + continue + authors_ids.append(author_id) # Loop over every 1000 authors authors_details = [] for start_index in range(0, len(authors_ids), 1000): @@ -252,6 +283,7 @@ def get_author_details(authors_ids): search_response.json()) sys.exit() authors_details += search_response.json() + authors_details += author_details_wo_id return authors_details def metrics_over_time_js(data) -> plt: