datacommonsorg · SudhishaK · Dec 17, 2024 · Dec 19, 2024 · Dec 19, 2024 · Dec 20, 2024
diff --git a/scripts/eurostat/health_determinants/alcohol_consumption/README.md b/scripts/eurostat/health_determinants/alcohol_consumption/README.md
@@ -18,18 +18,6 @@ The population is categorized by various set of combinations as below:
         10. Alcohol Consumption by Sex and Country of Birth.
         11. Alcohol Consumption by Sex and Country of Citizenship.
 
-
-### Download URL
-Input files are available for download from url: https://ec.europa.eu/eurostat/web/health/data/database -> Health -> Health determinants (hlth_det).
-
-### Import Procedure
-The below script will download the data and extract it.
-
-`python scripts/eurostat/health_determinants/common/download_eurostat_input_files.py --import_name alcohol_consumption`
-
-Files are created inside 'input_files' directory.
-
-
 #### Output
 Statistical variables for alcohol consumption are based on below properties available in input files.
 | Attribute                                     | Description                                                   	|
@@ -45,11 +33,6 @@ Statistical variables for alcohol consumption are based on below properties avai
 | Country of Citizenship                	| The citizenship of the population.                			|
 
 
-Below script will generate cleansed observation file (csv), mcf and tmcf files.
-
-`python scripts/eurostat/health_determinants/alcohol_consumption/process.py`
-
-
 #### Cleaned Observation File
 Cleaned data will be persisted as a CSV file in output/eurostat_population_alcohol_consumption.csv with the following columns.
 
@@ -65,9 +48,31 @@ MCF and tMCF files are presisted in below mentioned path.
 - [output/eurostat_population_alcohol_consumption.mcf]
 - [output/eurostat_population_alcohol_consumption.tmcf]
 
+### Download URL
+
+The data is available for download in tsv.gz format from https://ec.europa.eu/eurostat/web/main/data/database -> Data navigation tree -> Detailed datasets -> Population and social conditions -> Health -> Health determinants (hlth_det).
+The actual URLs are listed in `scripts/eurostat/health_determinants/common/import_download_details.py`.
 
 ### Running Tests
 
 Run the test cases
 
 `python3 -m unittest discover -v -s scripts/eurostat/health_determinants/alcohol_consumption/ -p process_test.py`
+
+### Import Procedure
+
+The script below downloads the data, cleans it, and generates the final csv, mcf and tmcf files.
+
+`python scripts/eurostat/health_determinants/alcohol_consumption/process.py`
+
+To only download the input files for this import, run the command below:
+
+`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=download`
+
+To only process the already-downloaded input files for this import, run the command below:
+
+`python scripts/eurostat/health_determinants/alcohol_consumption/process.py --mode=process`
+
+Downloaded files are saved in the 'input_files' directory.
+
+
diff --git a/scripts/eurostat/health_determinants/alcohol_consumption/manifest.json b/scripts/eurostat/health_determinants/alcohol_consumption/manifest.json
@@ -0,0 +1,19 @@
+{
+  "import_specifications": [
+    {
+      "import_name": "EuroStatHealth_AlcoholConsumption",
+      "curator_emails": ["sudhisha@google.com"],
+      "provenance_url": "https://ec.europa.eu/eurostat/web/main/data/database",
+      "provenance_description": "European Union (EU) Eurostat",
+      "scripts": ["process.py"],
+      "import_inputs": [
+        {
+          "template_mcf": "eurostat_population_alcoholconsumption.tmcf",
+          "cleaned_csv": "eurostat_population_alcoholconsumption.csv"
+        }
+      ],
+      "cron_schedule": "0 2 2 * *"
+    }
+  ]
+}
+
diff --git a/scripts/eurostat/health_determinants/alcohol_consumption/process.py b/scripts/eurostat/health_determinants/alcohol_consumption/process.py
@@ -18,20 +18,24 @@
 import os
 import sys
 import pandas as pd
+from absl import app, flags, logging
 
 _COMMON_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.insert(1, _COMMON_PATH)
 # pylint: disable=wrong-import-position
 from common.euro_stat import EuroStat
+from common import import_download_details, download
 # pylint: enable=wrong-import-position
 
+_FLAGS = flags.FLAGS
+flags.DEFINE_string('mode', '', 'Options: download or process')
+
 
 class EuroStatAlcoholConsumption(EuroStat):
     """
     This Class has requried methods to generate Cleaned CSV,
     MCF and TMCF Files.
     """
-    _import_name = "alcohol_consumption"
 
     _mcf_template = ("Node: dcid:{sv}"
                      "\n{sv_name}"
@@ -88,6 +92,30 @@ class EuroStatAlcoholConsumption(EuroStat):
         "NotACitizen": "citizenship",
     }
 
+    @staticmethod
+    def download_data(import_name):
+        """Downloads the raw Eurostat input files for the given import.
+
+        The files are saved in the import's 'input_files' directory.
+
+        Args:
+            import_name (str): Key into import_download_details.download_details.
+
+        Returns:
+            bool: Always True.
+        """
+        download_details = import_download_details.download_details[import_name]
+        download_path = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), '..', import_name,
+                         "input_files"))
+        os.makedirs(download_path, exist_ok=True)
+
+        for filename in download_details["filenames"]:
+            file_url = (download_details["input_url"] + str(filename) +
+                        download_details["file_extension"])
+            download.download_files([file_url], download_path)
+        return True
+
     # over-ridden parent abstract method
     def _property_correction(self):
         for k, v in self._sv_properties.items():
@@ -119,26 +147,41 @@ def _rename_frequency_column(self, df: pd.DataFrame) -> pd.DataFrame:
         return df.rename(columns={'frequenc': 'frequenc_alcohol'})
 
 
-if __name__ == '__main__':
-    input_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                              "input_files")
-    ip_files = os.listdir(input_path)
-    ip_files = [input_path + os.sep + file for file in ip_files]
+def main(_):
+    """Runs download and/or processing; an empty --mode runs both steps."""
+    mode = _FLAGS.mode
+    global import_name
+    import_name = "alcohol_consumption"
+    if mode == "" or mode == "download":
+        EuroStatAlcoholConsumption.download_data(import_name)
+    if mode == "" or mode == "process":
+        try:
+            module_dir = os.path.dirname(os.path.abspath(__file__))
+            input_path = os.path.join(module_dir, "input_files")
+            ip_files = os.listdir(input_path)
+            ip_files = [os.path.join(input_path, name) for name in ip_files]
+
+            # Defining Output Files
+            data_file_path = os.path.join(module_dir, "output")
+
+            csv_name = "eurostat_population_alcoholconsumption.csv"
+            mcf_name = "eurostat_population_alcoholconsumption.mcf"
+            tmcf_name = "eurostat_population_alcoholconsumption.tmcf"
 
-    # Defining Output Files
-    data_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                                  "output")
+            cleaned_csv_path = os.path.join(data_file_path, csv_name)
+            mcf_path = os.path.join(data_file_path, mcf_name)
+            tmcf_path = os.path.join(data_file_path, tmcf_name)
 
-    csv_name = "eurostat_population_alcoholconsumption.csv"
-    mcf_name = "eurostat_population_alcoholconsumption.mcf"
-    tmcf_name = "eurostat_population_alcoholconsumption.tmcf"
+            loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path,
+                                                mcf_path, tmcf_path,
+                                                import_name)
+            loader.generate_csv()
+            loader.generate_mcf()
+            loader.generate_tmcf()
+            print("Processing completed!")
+        except Exception as e:
+            logging.fatal(f'Processing error: {e}')
 
-    cleaned_csv_path = os.path.join(data_file_path, csv_name)
-    mcf_path = os.path.join(data_file_path, mcf_name)
-    tmcf_path = os.path.join(data_file_path, tmcf_name)
 
-    loader = EuroStatAlcoholConsumption(ip_files, cleaned_csv_path, mcf_path,
-                                        tmcf_path)
-    loader.generate_csv()
-    loader.generate_mcf()
-    loader.generate_tmcf()
+if __name__ == "__main__":
+    app.run(main)
diff --git a/scripts/eurostat/health_determinants/common/download.py b/scripts/eurostat/health_determinants/common/download.py
@@ -17,6 +17,7 @@
 """
 import gzip
 import urllib.request
+from absl import logging
 
 
 def download_gz_file(download_file_url: str, download_path: str) -> None:
@@ -25,7 +26,7 @@ def download_gz_file(download_file_url: str, download_path: str) -> None:
 
     Args:
         download_file_url (str): url of the file to be downloaded as a string
-        download_path (str): local directory to dlownload the file
+        download_path (str): local directory to download the file
 
     Returns:
         None
@@ -49,19 +50,24 @@ def download_files(download_files_url: list, download_path: str) -> None:
 
     Args:
         download_file_url (str): url of the file to be downloaded as a string
-        download_path (str): local directory to dlownload the file
+        download_path (str): local directory to download the file
 
     Returns:
         None
     """
-    for download_file_url in download_files_url:
-        file_extension = download_file_url.split(".")[-1]
+    try:
+        for download_file_url in download_files_url:
+            file_extension = download_file_url.split(".")[-1]
 
-        if file_extension == "gz":
-            download_gz_file(download_file_url, download_path)
-        elif file_extension == "txt":
-            download_gz_file(download_file_url, download_path)
-        elif file_extension == "csv":
-            download_gz_file(download_file_url, download_path)
-        elif file_extension == "pdf":
-            download_gz_file(download_file_url, download_path)
+            if file_extension == "gz":
+                download_gz_file(download_file_url, download_path)
+            elif file_extension == "txt":
+                download_gz_file(download_file_url, download_path)
+            elif file_extension == "csv":
+                download_gz_file(download_file_url, download_path)
+            elif file_extension == "pdf":
+                download_gz_file(download_file_url, download_path)
+            else:
+                download_gz_file(download_file_url, download_path)
+    except Exception as e:
+        logging.fatal(f'Download Error: {e}')