Merge pull request #6 from Daethyra/v3.2-conversion_extensions

V3.2 conversion extensions
Daethyra · May 6, 2023 · fb2c3fc · fb2c3fc
2 parents bfbd643 + 0922a41
commit fb2c3fc
Show file tree

Hide file tree

Showing 5 changed files with 128 additions and 37 deletions.
diff --git a/credits.txt b/credits.txt
diff --git a/extensions.py b/extensions.py
@@ -0,0 +1,82 @@
+import json
+import os
+import shutil
+
+import pandas as pd
+import pypandoc
+import img2pdf
+from pdf2image.pdf2image import convert_from_path
+import fitz
+
+
+class Converter:
+    """Convert one input file to PDF, JSON, CSV or YAML.
+
+    The source type is taken from the file extension, lowercased so the
+    checks below are case-insensitive.
+    """
+
+    def __init__(self, input_file):
+        """Store the input path and its lowercased extension."""
+        self.input_file = input_file
+        self.input_extension = os.path.splitext(input_file)[1].lower()
+
+    def _read_html_table(self, target):
+        """Return the first table of an HTML input as a DataFrame.
+
+        `target` is the format name used in the error message.
+        Raises ValueError when the input is not an .html/.htm file.
+        """
+        if self.input_extension not in ('.html', '.htm'):
+            raise ValueError(f"Conversion to {target} is not supported for this file type")
+        return pd.read_html(self.input_file)[0]
+
+    def to_pdf(self, output_file):
+        """Write a PDF version of the input to `output_file`.
+
+        PDFs are copied, images go through img2pdf and anything else is
+        handed to pandoc with the pdflatex engine.
+        """
+        # The extension is already lowercased, so '.PDF' needs no extra case.
+        if self.input_extension == '.pdf':
+            shutil.copy(self.input_file, output_file)
+        elif self.input_extension in ('.jpg', '.jpeg', '.png', '.bmp'):
+            # Convert first so a failure does not leave an empty output file.
+            pdf_bytes = img2pdf.convert(self.input_file)
+            if not pdf_bytes:
+                raise ValueError("Empty output")
+            with open(output_file, "wb") as pdf_file:
+                pdf_file.write(pdf_bytes)
+        else:
+            pypandoc.convert_file(self.input_file, 'pdf', outputfile=output_file, extra_args=['--pdf-engine', 'pdflatex', '--quiet'])
+
+    def to_json(self, output_file):
+        """Write the first HTML table as a JSON array of records."""
+        self._read_html_table("JSON").to_json(output_file, orient='records')
+
+    def to_csv(self, output_file):
+        """Write the first HTML table as CSV without the index column."""
+        self._read_html_table("CSV").to_csv(output_file, index=False)
+
+    def to_yaml(self, output_file):
+        """Write the first HTML table as a YAML sequence of mappings.
+
+        No YAML library is required: keys and values are emitted as JSON
+        scalars, which are also valid YAML flow scalars.
+        """
+        table = self._read_html_table("YAML")
+        # Let pandas' JSON writer turn cell values into plain JSON types.
+        records = json.loads(table.to_json(orient='records'))
+        with open(output_file, 'w', encoding='utf-8') as yaml_file:
+            if not records:
+                yaml_file.write("[]\n")
+            for record in records:
+                if not record:
+                    yaml_file.write("- {}\n")
+                    continue
+                lead = "- "
+                for key, value in record.items():
+                    key_text = json.dumps(key, ensure_ascii=False)
+                    value_text = json.dumps(value, ensure_ascii=False)
+                    yaml_file.write(f"{lead}{key_text}: {value_text}\n")
+                    lead = "  "
diff --git a/main.py b/main.py
@@ -1,17 +1,9 @@
 import os
 import sys
 import shutil
-import pypandoc
-import img2pdf
-from pdf2image.pdf2image import convert_from_path
-import fitz
 import requests
 from urllib.parse import urlparse
-
-pdflatex_path = shutil.which("pdflatex")
-if not pdflatex_path:
-    print("pdflatex not found. Please make sure it is installed and available in the system's PATH variable.")
-    sys.exit(1)
+from extensions import Converter
 
 def is_url(input_path):
     try:
@@ -27,34 +19,17 @@ def download_file(url, local_path):
             for chunk in r.iter_content(chunk_size=8192):
                 f.write(chunk)
 
-def convert_to_pdf(input_file, input_extension, output_file):
-    print(f"Processing {input_file}...")
-
-    try:
-        if input_extension in ['.pdf', '.PDF']:
-            shutil.copy(input_file, output_file)
-        elif input_extension.lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
-            with open(output_file, "wb") as pdf_file:
-                pdf_bytes = img2pdf.convert(input_file)
-                if pdf_bytes:
-                    pdf_file.write(pdf_bytes)
-                else:
-                    raise ValueError("Empty output")
-        else:
-            output = pypandoc.convert_file(input_file, 'pdf', outputfile=output_file, extra_args=['--pdf-engine', pdflatex_path, '--quiet'])
-    except Exception as e:
-        print(f"Error converting {input_file} to PDF: {str(e)}")
-        return
-
-    print(f"Converted {input_file} to {output_file}")
-
 if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("Usage: python main.py <input_file>")
+    if len(sys.argv) != 3:
+        print("Usage: python main.py <input_file> <output_format>")
         sys.exit(1)
 
     input_path = sys.argv[1]
+    output_format = sys.argv[2].lower()
 
+    if not output_format.startswith('.'):
+        output_format = '.' + output_format
+
     if is_url(input_path):
         url_filename = os.path.basename(urlparse(input_path).path)
         input_file = f"temp_{url_filename}"
@@ -66,10 +41,24 @@ def convert_to_pdf(input_file, input_extension, output_file):
         print(f"Input file '{input_file}' does not exist.")
         sys.exit(1)
 
-    input_filename, input_extension = os.path.splitext(input_file)
-    output_file = f"{input_filename}_output.pdf"
+    input_filename = os.path.splitext(input_file)[0]
+    output_file = f"{input_filename}_output{output_format}"
+
+    converter = Converter(input_file)
+
+    if output_format == '.pdf':
+        converter.to_pdf(output_file)
+    elif output_format == '.json':
+        converter.to_json(output_file)
+    elif output_format == '.csv':
+        converter.to_csv(output_file)
+    elif output_format == '.yaml':
+        converter.to_yaml(output_file)
+    else:
+        print(f"Unsupported output format: {output_format}")
+        sys.exit(1)
 
-    convert_to_pdf(input_file, input_extension, output_file)
+    print(f"Converted {input_file} to {output_file}")
 
     if is_url(input_path):
         os.remove(input_file)  # Clean up the temporary file
diff --git a/readme.md b/readme.md
@@ -17,7 +17,9 @@ A Python-based console application to convert various file formats, such as HTML
 3. Install the required dependencies by running: ```pip install -r requirements.txt```
 
 # Usage 
-To use the script, navigate to the directory containing the source code and run the following command:
+To use the script, first replace the {user} in `main.py` with your username, or otherwise point the program to the correct path.
+
+Then, navigate to the directory containing the source code and run the following command:
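-```python main.py <input_file>```
+```python main.py <input_file> <output_format>```
 
-Replace <input_file> with the path to the file you want to convert, or the URL of the file.
+Replace <input_file> with the path to the file you want to convert, or the URL of the file, and <output_format> with the target format (`pdf`, `json`, `csv` or `yaml`).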

diff --git a/utility.py b/utility.py
@@ -0,0 +1,63 @@
+import os
+import sys
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from extensions import Converter
+
+logging.basicConfig(filename='converter.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+def process_file(file_path, output_format):
+    """Convert one file to `output_format` and log the result.
+
+    The output is written next to the input as `<name>_output<ext>`.
+    Errors are logged instead of raised so one bad file cannot stop a
+    batch run.
+    """
+    # Fallback so the error log can name the file even if abspath() fails.
+    input_file = file_path
+    try:
+        input_file = os.path.abspath(file_path)
+        input_filename = os.path.splitext(input_file)[0]
+        output_file = f"{input_filename}_output{output_format}"
+        converter = Converter(input_file)
+        if output_format == '.pdf':
+            converter.to_pdf(output_file)
+        elif output_format == '.json':
+            converter.to_json(output_file)
+        elif output_format == '.csv':
+            converter.to_csv(output_file)
+        elif output_format == '.yaml':
+            converter.to_yaml(output_file)
+        else:
+            raise ValueError(f"Unsupported output format: {output_format}")
+        logging.info(f"Successfully converted {input_file} to {output_file}")
+    except Exception as e:
+        logging.error(f"Error converting {input_file}: {str(e)}")
+
+
+def batch_process(directory_path, output_format, max_workers=4):
+    """Convert every file under `directory_path` using a thread pool."""
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        for root, _, files in os.walk(directory_path):
+            for file in files:
+                file_path = os.path.join(root, file)
+                executor.submit(process_file, file_path, output_format)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage: python utility.py <directory_path> <output_format>")
+        sys.exit(1)
+
+    directory_path = sys.argv[1]
+    output_format = sys.argv[2].lower()
+
+    if not output_format.startswith('.'):
+        output_format = '.' + output_format
+
+    if not os.path.isdir(directory_path):
+        print(f"Directory '{directory_path}' does not exist.")
+        sys.exit(1)
+
+    batch_process(directory_path, output_format)