From 9effa26dc9c3a7ed8d0a968e0c05c12c0fddecc8 Mon Sep 17 00:00:00 2001 From: cyberin0 Date: Thu, 4 May 2023 15:37:18 -0700 Subject: [PATCH 1/2] Commit via VSCode CLI. --- credits.txt | 1 - readme.md | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) delete mode 100644 credits.txt diff --git a/credits.txt b/credits.txt deleted file mode 100644 index ce5ff97..0000000 --- a/credits.txt +++ /dev/null @@ -1 +0,0 @@ -https://github.com/learning-zone/website-templates/tree/master/coffee-shop-free-html5-template \ No newline at end of file diff --git a/readme.md b/readme.md index c24ea3c..7e8c1aa 100644 --- a/readme.md +++ b/readme.md @@ -17,7 +17,9 @@ A Python-based console application to convert various file formats, such as HTML 3. Install the required dependencies by running: ```pip install -r requirements.txt``` # Usage -To use the script, navigate to the directory containing the source code and run the following command: +To use the script, first replace the {user} in `main.py` with your username, or otherwise point the program to the correct path. + +Then, navigate to the directory containing the source code and run the following command: ```python main.py ``` Replace with the path to the file you want to convert, or the URL of the file. 
From 482938172e626c0dfe15cb604807e707fada2f9b Mon Sep 17 00:00:00 2001
From: Daethyra <109057945+Daethyra@users.noreply.github.com>
Date: Fri, 5 May 2023 19:43:57 -0700
Subject: [PATCH 2/2] new file: extensions.py modified: main.py new file: utility.py

---
Review notes: extensions.py now imports shutil and json, to_yaml writes
YAML rather than CSV, utility.py no longer imports names from main, and
both usage messages list their arguments. Hunk counts and the diffstat
were updated to match; blob ids on the index lines were not recomputed.

 extensions.py | 68 +++++++++++++++++++++++++++++++++++++++++++++
 main.py       | 56 ++++++++++++++++---------------------------
 utility.py    | 58 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 150 insertions(+), 32 deletions(-)
 create mode 100644 extensions.py
 create mode 100644 utility.py

diff --git a/extensions.py b/extensions.py
new file mode 100644
index 0000000..4dd0826
--- /dev/null
+++ b/extensions.py
@@ -0,0 +1,68 @@
+import json
+import os
+import shutil
+import pandas as pd
+import pypandoc
+import img2pdf
+from pdf2image.pdf2image import convert_from_path
+import fitz
+
+
+class Converter:
+    """Convert one input file to PDF, JSON, CSV or YAML."""
+
+    def __init__(self, input_file):
+        self.input_file = input_file
+        # Lower-cased once so every extension check below is case-insensitive.
+        self.input_extension = os.path.splitext(input_file)[1].lower()
+
+    def to_pdf(self, output_file):
+        """Copy PDFs, wrap images with img2pdf, send anything else to pandoc."""
+        if self.input_extension == '.pdf':
+            shutil.copy(self.input_file, output_file)
+        elif self.input_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
+            with open(output_file, "wb") as pdf_file:
+                pdf_bytes = img2pdf.convert(self.input_file)
+                if pdf_bytes:
+                    pdf_file.write(pdf_bytes)
+                else:
+                    raise ValueError("Empty output")
+        else:
+            pypandoc.convert_file(self.input_file, 'pdf', outputfile=output_file, extra_args=['--pdf-engine', 'pdflatex', '--quiet'])
+
+    def to_json(self, output_file):
+        """Write the first HTML table of the input as JSON records."""
+        if self.input_extension in ['.html', '.htm']:
+            df = pd.read_html(self.input_file)[0]
+            df.to_json(output_file, orient='records')
+        else:
+            raise ValueError("Conversion to JSON is not supported for this file type")
+
+    def to_csv(self, output_file):
+        """Write the first HTML table of the input as CSV without the index."""
+        if self.input_extension in ['.html', '.htm']:
+            df = pd.read_html(self.input_file)[0]
+            df.to_csv(output_file, index=False)
+        else:
+            raise ValueError("Conversion to CSV is not supported for this file type")
+
+    def to_yaml(self, output_file):
+        """Write the first HTML table of the input as a YAML list of mappings."""
+        if self.input_extension not in ['.html', '.htm']:
+            raise ValueError("Conversion to YAML is not supported for this file type")
+        df = pd.read_html(self.input_file)[0]
+        # Round-trip through JSON so every cell becomes a plain JSON value;
+        # JSON scalars are valid YAML, so no YAML library is required.
+        records = json.loads(df.to_json(orient='records'))
+        lines = []
+        for record in records:
+            prefix = '- '
+            for key, value in record.items():
+                key_text = json.dumps(key, ensure_ascii=False)
+                value_text = json.dumps(value, ensure_ascii=False)
+                lines.append(f"{prefix}{key_text}: {value_text}")
+                prefix = '  '
+            if not record:
+                lines.append('- {}')
+        with open(output_file, 'w', encoding='utf-8') as yaml_file:
+            yaml_file.write(('\n'.join(lines) + '\n') if lines else '[]\n')
diff --git a/main.py b/main.py
index 36b3d2f..35bf246 100644
--- a/main.py
+++ b/main.py
@@ -1,14 +1,9 @@
 import os
 import sys
 import shutil
-import pypandoc
-import img2pdf
-from pdf2image.pdf2image import convert_from_path
-import fitz
 import requests
 from urllib.parse import urlparse
-
-pdflatex_path = r"C:\Users\{user}\AppData\Local\Programs\MiKTeX\miktex\bin\x64\pdflatex.exe"
+from extensions import Converter
 
 def is_url(input_path):
     try:
@@ -24,34 +19,17 @@ def download_file(url, local_path):
             for chunk in r.iter_content(chunk_size=8192):
                 f.write(chunk)
 
-def convert_to_pdf(input_file, input_extension, output_file):
-    print(f"Processing {input_file}...")
-
-    try:
-        if input_extension in ['.pdf', '.PDF']:
-            shutil.copy(input_file, output_file)
-        elif input_extension.lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
-            with open(output_file, "wb") as pdf_file:
-                pdf_bytes = img2pdf.convert(input_file)
-                if pdf_bytes:
-                    pdf_file.write(pdf_bytes)
-                else:
-                    raise ValueError("Empty output")
-        else:
-            output = pypandoc.convert_file(input_file, 'pdf', outputfile=output_file, extra_args=['--pdf-engine', pdflatex_path, '--quiet'])
-    except Exception as e:
-        print(f"Error converting {input_file} to PDF: {str(e)}")
-        return
-
-    print(f"Converted {input_file} to {output_file}")
-
 if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("Usage: python main.py ")
+    if len(sys.argv) != 3:
+        print("Usage: python main.py <input_path> <output_format>")
         sys.exit(1)
 
     input_path = sys.argv[1]
+    output_format = sys.argv[2].lower()
 
+    if not output_format.startswith('.'):
+        output_format = '.' + output_format
+
     if is_url(input_path):
         url_filename = os.path.basename(urlparse(input_path).path)
         input_file = f"temp_{url_filename}"
@@ -63,10 +41,24 @@ def convert_to_pdf(input_file, input_extension, output_file):
             print(f"Input file '{input_file}' does not exist.")
             sys.exit(1)
 
-    input_filename, input_extension = os.path.splitext(input_file)
-    output_file = f"{input_filename}_output.pdf"
+    input_filename = os.path.splitext(input_file)[0]
+    output_file = f"{input_filename}_output{output_format}"
+
+    converter = Converter(input_file)
+
+    if output_format == '.pdf':
+        converter.to_pdf(output_file)
+    elif output_format == '.json':
+        converter.to_json(output_file)
+    elif output_format == '.csv':
+        converter.to_csv(output_file)
+    elif output_format == '.yaml':
+        converter.to_yaml(output_file)
+    else:
+        print(f"Unsupported output format: {output_format}")
+        sys.exit(1)
 
-    convert_to_pdf(input_file, input_extension, output_file)
+    print(f"Converted {input_file} to {output_file}")
 
     if is_url(input_path):
         os.remove(input_file)  # Clean up the temporary file
diff --git a/utility.py b/utility.py
new file mode 100644
index 0000000..4fdffe9
--- /dev/null
+++ b/utility.py
@@ -0,0 +1,58 @@
+import os
+import sys
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from extensions import Converter
+
+logging.basicConfig(filename='converter.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+def process_file(file_path, output_format):
+    """Convert one file to output_format, logging success or failure."""
+    # Resolved before the try block so the error handler can always name it.
+    input_file = os.path.abspath(file_path)
+    try:
+        input_filename = os.path.splitext(input_file)[0]
+        output_file = f"{input_filename}_output{output_format}"
+        converter = Converter(input_file)
+        if output_format == '.pdf':
+            converter.to_pdf(output_file)
+        elif output_format == '.json':
+            converter.to_json(output_file)
+        elif output_format == '.csv':
+            converter.to_csv(output_file)
+        elif output_format == '.yaml':
+            converter.to_yaml(output_file)
+        else:
+            raise ValueError(f"Unsupported output format: {output_format}")
+        logging.info(f"Successfully converted {input_file} to {output_file}")
+    except Exception as e:
+        # Best-effort batch: log the failure so the remaining files still run.
+        logging.error(f"Error converting {input_file}: {str(e)}")
+
+
+def batch_process(directory_path, output_format, max_workers=4):
+    """Submit every file found under directory_path to a thread pool."""
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        for root, _, files in os.walk(directory_path):
+            for file in files:
+                file_path = os.path.join(root, file)
+                executor.submit(process_file, file_path, output_format)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage: python utility.py <directory_path> <output_format>")
+        sys.exit(1)
+
+    directory_path = sys.argv[1]
+    output_format = sys.argv[2].lower()
+
+    if not output_format.startswith('.'):
+        output_format = '.' + output_format
+
+    if not os.path.exists(directory_path) or not os.path.isdir(directory_path):
+        print(f"Directory '{directory_path}' does not exist.")
+        sys.exit(1)
+
+    batch_process(directory_path, output_format)