initial commit

ymilhahn · Mar 10, 2018 · 9e57aa5 · 9e57aa5
commit 9e57aa5
Show file tree

Hide file tree

Showing 7 changed files with 1,103 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
@@ -0,0 +1,74 @@
+Altmetrics for DOIs
+===================
+A Python script which requests [Altmetric scores](https://api.altmetric.com) for a given list of DOIs of scientific articles and writes the scores to a csv file.
+
+## Background
+Altmetrics are an alternative way to measure the impact of research in the scientific community and on society. Altmetric.com gathers scores from a variety of sources, for example mentions on Twitter or in mass media articles.  
+Systematic analysis of altmetric scores makes it possible to measure which research receives attention from society or the media. For example, one can investigate which papers had a public impact over a certain period of time.
+
+## Installation and execution
+Python 3.x is required.  
+See official Python Docs: [Python Setup and Usage](https://docs.python.org/3/using/index.html)
+
+## Procedure
+### 1. Manual: Create file with DOIs
+- Source:
+	- can be exported e.g. from scopus.com
+	- can be extracted from plaintext (via `extract_dois_from_text.py`)
+
+
+### 2. Automatic via get_altmetric_scores_for_dois.py
+- Asks for the file containing the DOIs
+- Sends a request for each DOI to the Altmetric API
+- Writes all Altmetric scores for each DOI into a single csv file
+- Creates an error file containing all DOIs for which the request wasn't successful
+
+### 3. Manual: Check results file
+- Compare the number of rows (plus the number of errors) to the initial number of DOIs to see if the request finished prematurely
+
+## In- and Output
+
+### Input
+
+#### CSV file with DOIs
+
+| Example | [example/myDOIs.csv](./example/myDOIs.csv) |
+|-----------------|-------------------|
+| File format     | csv               |
+| First row       | Header: "DOI"     |
+| Following rows  | one row = one DOI |
+
+Consists of one column containing the DOIs of all scientific papers that you want to examine.
+
+### Output
+#### results file
+
+| Example | [example/myDOIs_result.csv](./example/myDOIs_result.csv) |
+|-----------------|-------------------------------|
+| File format     | csv                           |
+| First row       | column names                  |
+| Following rows  | one row = one DOI/publication |
+
+- `Title` – title of the text corresponding to the DOI
+- `Cited by...` – the various altmetric scores as integers, for example the number of times the text was mentioned in a tweet (`cited_by_tweeters_count`) or in a media article (`cited_by_msm_count`)
+
+#### error file
+
+| Example | [example/myDOIs_errors.txt](./example/myDOIs_errors.txt) |
+|-----------------|-----|
+| File format     | txt |
+
+A list of all errors that occurred, for example all DOIs that were not found on the Altmetric servers.
+
+## API and Data Usage
+The API is maintained by Altmetric.  
+All data received over the API is [licensed by Altmetric.com](https://api.altmetric.com/index.html#datalicense) and must be attributed.
+
+Consider [registering for an API key](https://www.altmetric.com/research-access/) if you need a high volume of data.
+
+
+## License
+- **Conception:** Prof. Dr. Markus Lehmkuhl (KIT & FU Berlin)
+- **Implementation:** Yannick Milhahn (TU Berlin & FU Berlin), Clarissa Elisabeth Staudt (TU Berlin & FU Berlin)
+
+Distributed under GPLv3 License. See LICENSE for more information.
diff --git a/example/myDOIs.csv b/example/myDOIs.csv
@@ -0,0 +1,4 @@
+DOI
+10.1016/s0140-6736(07)61306-3
+10.1176/appi.ajp.2011.11060940
+10.1007/s00439-009-0694-x
diff --git a/example/myDOIs_errors.txt b/example/myDOIs_errors.txt
@@ -0,0 +1,2 @@
+Errors: 
+DOIs not found:
diff --git a/example/myDOIs_result.csv b/example/myDOIs_result.csv
@@ -0,0 +1,4 @@
+DOI;Title;Autor;Year_published;cited_by_posts_count;cited_by_delicious_count;cited_by_fbwalls_count;cited_by_feeds_count;cited_by_forum_count;cited_by_gplus_count;cited_by_linkedin_count;cited_by_msm_count;cited_by_peer_review_sites_count;cited_by_pinners_count;cited_by_policies_count;cited_by_qs_count;cited_by_rdts_count;cited_by_rh_count;cited_by_tweeters_count;cited_by_videos_count;cited_by_weibo_count;cited_by_wikipedia_count 
+10.1016/s0140-6736(07)61306-3;"Food additives and hyperactive behaviour in 3-year-old and 8/9-year-old children in the community: a randomised, double-blinded, placebo-controlled trial.";McCann;2007;143;NULL;22;19;NULL;2;NULL;24;NULL;NULL;4;NULL;2;NULL;53;1;NULL;1;
+10.1176/appi.ajp.2011.11060940;"Striatal Dopamine Transporter Alterations in ADHD: Pathophysiology or Adaptation to Psychostimulants? A Meta-Analysis.";Balottin;2012;10;NULL;NULL;NULL;NULL;NULL;NULL;1;NULL;NULL;NULL;NULL;NULL;NULL;7;NULL;NULL;2;
+10.1007/s00439-009-0694-x;"Candidate gene studies of ADHD: a meta-analytic review";Gizer;2009;5;NULL;NULL;NULL;NULL;NULL;NULL;NULL;NULL;NULL;1;NULL;NULL;NULL;1;NULL;NULL;3;
diff --git a/extract_dois_from_text.py b/extract_dois_from_text.py
@@ -0,0 +1,117 @@
+# imports
+import csv
+import re
+import sys
+
+
+# Regex pattern for DOI
+# Source: https://github.com/bcaller/markdown_doi/blob/master/markdown_doi/md_doi.py
+# Matches the prefix "10." followed by 4 to 9 digits and a slash, then a run of
+# letters, digits and the punctuation -._;()/: that ends on a word boundary
+# which is not directly followed by a digit, "-", "_" or "@".
+doi_pattern = r'''(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+\b(?![\d\-_@]))'''
+# The standalone word "doi"; matchDOIs applies it case-insensitively.
+doi_word = r'''\bdoi\b'''
+# counts the number of the word DOI - used for tracking possible errors
+# NOTE(review): never updated in the visible code - confirm whether it is still needed.
+doi_count = 0
+
+def getInputFile():
+    input_fi = ""
+    # check if file name was provied as parameter
+    if len(sys.argv) > 1:
+        input_fi = str(sys.argv[1])
+
+    # otherwise ask for it
+    else:
+        print("Please specify your input file. \nExample: myFulltext.txt")
+        input_fi = input("Enter file name: ")
+    return input_fi
+
+def readInDocument(input_file):
+    try:
+        with open(input_file, "r", newline='') as csvfile:
+            print("\nReading document.\n")
+            line_count = 0
+            document = ""
+            reader = csv.reader(csvfile)
+            for row in reader:
+                for element in row:
+                    document = document + element
+                    document = document + "\n"
+                line_count = line_count + 1
+            print("Read ", line_count, "lines.")
+            return document
+    except (OSError, IOError) as e:
+        print("ERROR: Input file not found.\n")
+        sys.exit(1)
+
+
+def matchDOIs(document):
+    # matches all DOis and word "doi"
+    matches_doi = re.findall(doi_pattern, document, re.DOTALL)
+    matches_word = re.findall(doi_word, document, re.DOTALL | re.IGNORECASE)
+
+    # counter
+    n_doi = len(matches_doi)
+
+    # notifies the user if no dois  could be matched
+    if n_doi ==  0:
+        printErrors("WARNING: No DOIs found.\n")
+    # tries to find out if regex might have missed some dois
+    if len(matches_word) !=  n_doi:
+        printErrors("WARNING: DOIs could not be matched correctly.\nThe word count of 'doi' or 'DOI' does not match the number of DOIs identified.")
+        printErrors("Found " + str(len(matches_word)) + " mentions of the phrase 'doi' and 'DOI'.\n")
+
+    print("Found", n_doi, "DOIs.\n")
+
+    # shows all doi matches
+    # print("The following matches were found:")
+    # print(matches_doi, "\n")
+
+    return matches_doi
+
+def printErrors(error):
+    """Report a warning or error message by printing it to standard output."""
+    # TODO:
+    # could print errors to the new csv
+    print(error)
+
+
+def createOutputFile(input_file_name):
+    # prints all identified dois in new csv
+    print("Creating output file with all identified DOIs.\n")
+
+    output_file = input_file_name + "_dois.csv"
+
+    output_dois = open(output_file,"w")
+    output_dois.write("DOI\n")
+
+    counter = 0
+    for doi in matches_doi:
+        if counter != len(matches_doi) - 1:
+            dois = doi + "\n"
+        else:
+            dois = doi
+        output_dois.write(dois)
+        counter = counter + 1
+    output_dois.close()
+    print("Output file with the name", output_file, "was created.\n")
+
+
+#
+# main function
+#
+
+if __name__ == "__main__":
+    # reads in or asks user for input file
+    input_file = getInputFile()
+
+    if input_file != "":
+        # reads in the provided document
+        document = readInDocument(input_file)
+
+        # matches the dois
+        matches_doi = matchDOIs(document)
+
+        # creates an output file containing all dois
+        input_file_name = input_file.rsplit(".", 1)[0] # remove file extension from name
+        createOutputFile(input_file_name)
+
+    else:
+        print("\nERROR: Please enter a valid file name.\n")
+        getInputFile()