Skip to content

Commit

Permalink
Merge pull request #6 from Daethyra/v3.2-conversion_extensions
Browse files Browse the repository at this point in the history
V3.2 conversion extensions
  • Loading branch information
Daethyra authored May 6, 2023
2 parents bfbd643 + 0922a41 commit fb2c3fc
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 37 deletions.
1 change: 0 additions & 1 deletion credits.txt

This file was deleted.

46 changes: 46 additions & 0 deletions extensions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json
import os
import shutil

import fitz
import img2pdf
import pandas as pd
import pypandoc
from pdf2image.pdf2image import convert_from_path


class Converter:
    """Convert one input file to PDF, JSON, CSV or YAML.

    The source format is taken from the lower-cased extension of
    ``input_file``. Every ``to_*`` method writes its result to the
    ``output_file`` path it is given and returns ``None``.
    """

    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')
    _HTML_EXTENSIONS = ('.html', '.htm')

    def __init__(self, input_file):
        """Remember the input path and its lower-cased extension."""
        self.input_file = input_file
        self.input_extension = os.path.splitext(input_file)[1].lower()

    def to_pdf(self, output_file):
        """Write a PDF version of the input file to ``output_file``.

        PDFs are copied as-is, supported images go through ``img2pdf`` and
        everything else is handed to pandoc with the ``pdflatex`` engine.

        Raises:
            ValueError: if ``img2pdf`` produced no data for an image input.
        """
        # input_extension is already lower-cased, so one comparison is enough.
        if self.input_extension == '.pdf':
            shutil.copy(self.input_file, output_file)
        elif self.input_extension in self._IMAGE_EXTENSIONS:
            # Convert before opening the destination so a failed conversion
            # does not leave an empty output file behind.
            pdf_bytes = img2pdf.convert(self.input_file)
            if not pdf_bytes:
                raise ValueError("Empty output")
            with open(output_file, "wb") as pdf_file:
                pdf_file.write(pdf_bytes)
        else:
            pypandoc.convert_file(
                self.input_file,
                'pdf',
                outputfile=output_file,
                extra_args=['--pdf-engine', 'pdflatex', '--quiet'],
            )

    def to_json(self, output_file):
        """Write the first HTML table of the input as a JSON list of records.

        Raises:
            ValueError: if the input is not an ``.html``/``.htm`` file.
        """
        self._require_html("Conversion to JSON is not supported for this file type")
        self._read_first_table().to_json(output_file, orient='records')

    def to_csv(self, output_file):
        """Write the first HTML table of the input as CSV (no index column).

        Raises:
            ValueError: if the input is not an ``.html``/``.htm`` file.
        """
        self._require_html("Conversion to CSV is not supported for this file type")
        self._read_first_table().to_csv(output_file, index=False)

    def to_yaml(self, output_file):
        """Write the first HTML table of the input as a YAML list of mappings.

        Keys and values are emitted as JSON scalars, which YAML parsers
        accept, so no YAML library is required.

        Raises:
            ValueError: if the input is not an ``.html``/``.htm`` file.
        """
        self._require_html("Conversion to YAML is not supported for this file type")
        # Round-trip through pandas' JSON encoder so NaN cells become null
        # and numpy scalars become plain Python values.
        records = json.loads(self._read_first_table().to_json(orient='records'))
        with open(output_file, "w", encoding="utf-8") as yaml_file:
            yaml_file.write(self._records_to_yaml(records))

    def _require_html(self, message):
        """Raise ``ValueError(message)`` unless the input is an HTML file."""
        if self.input_extension not in self._HTML_EXTENSIONS:
            raise ValueError(message)

    def _read_first_table(self):
        """Return the first table parsed from the input HTML as a DataFrame."""
        return pd.read_html(self.input_file)[0]

    @staticmethod
    def _records_to_yaml(records):
        """Render a list of flat dicts as a YAML block sequence of mappings."""
        if not records:
            return "[]\n"
        lines = []
        for record in records:
            if not record:
                lines.append("- {}")
                continue
            prefix = "- "
            for key, value in record.items():
                lines.append(f"{prefix}{json.dumps(key)}: {json.dumps(value)}")
                # Only the first key of a record carries the list marker.
                prefix = "  "
        return "\n".join(lines) + "\n"
59 changes: 24 additions & 35 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
import os
import sys
import shutil
import pypandoc
import img2pdf
from pdf2image.pdf2image import convert_from_path
import fitz
import requests
from urllib.parse import urlparse

pdflatex_path = shutil.which("pdflatex")
if not pdflatex_path:
print("pdflatex not found. Please make sure it is installed and available in the system's PATH variable.")
sys.exit(1)
from extensions import Converter

def is_url(input_path):
try:
Expand All @@ -27,34 +19,17 @@ def download_file(url, local_path):
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)

def convert_to_pdf(input_file, input_extension, output_file):
    """Convert ``input_file`` to a PDF at ``output_file``, reporting on stdout.

    PDFs are copied, supported images go through ``img2pdf`` and every other
    type is handed to pandoc using the module-level ``pdflatex_path`` engine.
    Failures are printed rather than raised; the function always returns None.

    Args:
        input_file: path of the file to convert.
        input_extension: extension of ``input_file`` including the dot;
            matched case-insensitively.
        output_file: path the PDF is written to.
    """
    print(f"Processing {input_file}...")

    try:
        # Normalise once so '.Pdf' is treated like '.pdf', as images already are.
        extension = input_extension.lower()
        if extension == '.pdf':
            shutil.copy(input_file, output_file)
        elif extension in ('.jpg', '.jpeg', '.png', '.bmp'):
            # Convert first so a failure does not leave an empty output file.
            pdf_bytes = img2pdf.convert(input_file)
            if not pdf_bytes:
                raise ValueError("Empty output")
            with open(output_file, "wb") as pdf_file:
                pdf_file.write(pdf_bytes)
        else:
            pypandoc.convert_file(
                input_file,
                'pdf',
                outputfile=output_file,
                extra_args=['--pdf-engine', pdflatex_path, '--quiet'],
            )
    except Exception as e:
        # Best-effort CLI helper: report the problem and give up on this file.
        print(f"Error converting {input_file} to PDF: {str(e)}")
        return

    print(f"Converted {input_file} to {output_file}")

if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python main.py <input_file>")
if len(sys.argv) != 3:
print("Usage: python main.py <input_file> <output_format>")
sys.exit(1)

input_path = sys.argv[1]
output_format = sys.argv[2].lower()

if not output_format.startswith('.'):
output_format = '.' + output_format

if is_url(input_path):
url_filename = os.path.basename(urlparse(input_path).path)
input_file = f"temp_{url_filename}"
Expand All @@ -66,10 +41,24 @@ def convert_to_pdf(input_file, input_extension, output_file):
print(f"Input file '{input_file}' does not exist.")
sys.exit(1)

input_filename, input_extension = os.path.splitext(input_file)
output_file = f"{input_filename}_output.pdf"
input_filename = os.path.splitext(input_file)[0]
output_file = f"{input_filename}_output{output_format}"

converter = Converter(input_file)

if output_format == '.pdf':
converter.to_pdf(output_file)
elif output_format == '.json':
converter.to_json(output_file)
elif output_format == '.csv':
converter.to_csv(output_file)
elif output_format == '.yaml':
converter.to_yaml(output_file)
else:
print(f"Unsupported output format: {output_format}")
sys.exit(1)

convert_to_pdf(input_file, input_extension, output_file)
print(f"Converted {input_file} to {output_file}")

if is_url(input_path):
os.remove(input_file) # Clean up the temporary file
4 changes: 3 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ A Python-based console application to convert various file formats, such as HTML
3. Install the required dependencies by running: ```pip install -r requirements.txt```

# Usage
To use the script, navigate to the directory containing the source code and run the following command:
To use the script, first replace the {user} in `main.py` with your username, or otherwise point the program to the correct path.

Then, navigate to the directory containing the source code and run the following command:
```python main.py <input_file> <output_format>```

Replace <input_file> with the path to the file you want to convert, or the URL of the file, and <output_format> with the target format (pdf, json, csv or yaml).
Expand Down
55 changes: 55 additions & 0 deletions utility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import os
import sys
import logging
from concurrent.futures import ThreadPoolExecutor
from main import input_file, output_format
from extensions import Converter

logging.basicConfig(filename='converter.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def process_file(file_path, output_format):
    """Convert one file to ``output_format`` and log the outcome.

    The result is written next to the input as ``<name>_output<format>``.
    Errors are logged instead of raised so that one bad file does not
    surface as an unhandled exception in the caller.

    Args:
        file_path: path of the file to convert.
        output_format: target extension including the leading dot
            ('.pdf', '.json', '.csv' or '.yaml').
    """
    # Bind the name up front so the error handler can always report a path,
    # even when resolving the absolute path is what failed.
    input_file = file_path
    try:
        input_file = os.path.abspath(file_path)
        output_file = f"{os.path.splitext(input_file)[0]}_output{output_format}"

        conversions = {
            '.pdf': 'to_pdf',
            '.json': 'to_json',
            '.csv': 'to_csv',
            '.yaml': 'to_yaml',
        }
        method_name = conversions.get(output_format)
        if method_name is None:
            raise ValueError(f"Unsupported output format: {output_format}")

        getattr(Converter(input_file), method_name)(output_file)
        logging.info(f"Successfully converted {input_file} to {output_file}")
    except Exception as e:
        logging.error(f"Error converting {input_file}: {str(e)}")


def batch_process(directory_path, output_format, max_workers=4):
    """Convert every file under ``directory_path`` using a thread pool.

    The tree is walked recursively and one ``process_file`` task is queued
    per file. The call returns once the pool has shut down.

    Args:
        directory_path: root directory to walk.
        output_format: target extension passed through to ``process_file``.
        max_workers: size of the thread pool.
    """
    pool = ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        for parent, _subdirs, names in os.walk(directory_path):
            candidate_paths = (os.path.join(parent, name) for name in names)
            for candidate in candidate_paths:
                pool.submit(process_file, candidate, output_format)


if __name__ == "__main__":
    # Command-line entry point: utility.py <directory_path> <output_format>
    if len(sys.argv) != 3:
        print("Usage: python utility.py <directory_path> <output_format>")
        sys.exit(1)

    directory_path, output_format = sys.argv[1], sys.argv[2].lower()

    # Accept both "pdf" and ".pdf" on the command line.
    output_format = output_format if output_format.startswith('.') else '.' + output_format

    # isdir() is already False for paths that do not exist.
    if not os.path.isdir(directory_path):
        print(f"Directory '{directory_path}' does not exist.")
        sys.exit(1)

    batch_process(directory_path, output_format)

0 comments on commit fb2c3fc

Please sign in to comment.