Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add files via upload #5

Merged
merged 1 commit into from
May 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,322 changes: 661 additions & 661 deletions LICENSE

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions credits.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://github.com/learning-zone/website-templates/tree/master/coffee-shop-free-html5-template
149 changes: 55 additions & 94 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,111 +1,72 @@
import sys
import os
import sys
import shutil
import pypandoc
import img2pdf
from pdf2image.pdf2image import convert_from_path
import fitz
from pdf2image import convert_from_path, convert_from_bytes # type: ignore
from io import BytesIO
from PIL import Image
import html2text
import docx2txt
import base64
import requests
from urllib.parse import urlparse

def get_content(input_file, input_extension):
    """Return the raw bytes of a PDF input file, or ``None`` for other types.

    Args:
        input_file: Path of the file to read.
        input_extension: File extension including the leading dot
            (for example ``'.pdf'``). Matching is case-insensitive, so
            ``'.Pdf'`` is accepted as well as ``'.pdf'`` and ``'.PDF'``.

    Returns:
        The file contents as ``bytes`` when the extension denotes a PDF,
        otherwise ``None``.

    Raises:
        OSError: If the PDF file cannot be opened or read.
    """
    # A falsy extension (empty string / None) can never be a PDF.
    if (input_extension or '').lower() != '.pdf':
        return None
    with open(input_file, 'rb') as f:
        return f.read()
pdflatex_path = r"C:\Users\dae\AppData\Local\Programs\MiKTeX\miktex\bin\x64\pdflatex.exe"

def is_url(input_path):
    """Return ``True`` when *input_path* is an HTTP(S) URL.

    Only ``http`` and ``https`` are accepted because the path is later
    fetched with ``requests``, which cannot download other schemes; anything
    else (including Windows drive paths such as ``C:\\dir\\file``, which
    ``urlparse`` reports with a one-letter scheme and no netloc) is treated
    as a local path.

    Args:
        input_path: Candidate string, typically a command-line argument.

    Returns:
        ``True`` if the string has an ``http``/``https`` scheme and a
        network location, ``False`` otherwise (including when the string
        cannot be parsed as a URL).
    """
    try:
        parsed = urlparse(input_path)
    except ValueError:
        # urlparse rejects some malformed inputs, e.g. an unbalanced
        # IPv6 bracket in the host part.
        return False
    # urlparse lower-cases the scheme, so a plain membership test suffices.
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

def download_file(url, local_path, timeout=30):
    """Download *url* and write the response body to *local_path*.

    The response is streamed in 8 KiB chunks so large files are never held
    in memory in full.

    Args:
        url: Address to fetch with an HTTP GET request.
        local_path: Destination file path; opened in binary write mode, so
            an existing file is overwritten.
        timeout: Seconds to wait for the server to connect/send data before
            giving up. Without a timeout ``requests`` may block forever on
            an unresponsive server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If *local_path* cannot be written.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # Skip empty keep-alive chunks.
                if chunk:
                    f.write(chunk)

def convert_to_pdf(input_file, input_extension, output_file):
content = get_content(input_file, input_extension)

if content is None:
print("Unable to read file content.")
sys.exit()
print(f"Processing {input_file}...")

if input_extension in ['.pdf', '.PDF']:
with open(output_file, 'wb') as f:
f.write(content)
elif input_extension in ['.jpeg', '.jpg', '.JPEG', '.JPG']:
images = convert_from_bytes(content)
images[0].save(output_file, 'PDF', resolution=100.0, save_all=True, append_images=images[1:])
elif input_extension in ['.docx', '.DOCX']:
doc_text = docx2txt.process(input_file)
with fitz.open() as pdf_doc: # type: ignore
pdf_doc.new_page()
page = pdf_doc[-1]
page.insert_textbox(fitz.Rect(72, 72, 522, 720), doc_text)
pdf_doc.save(output_file)
else:
print("Unsupported file type.")
sys.exit()
try:
if input_extension in ['.pdf', '.PDF']:
shutil.copy(input_file, output_file)
elif input_extension.lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
with open(output_file, "wb") as pdf_file:
pdf_bytes = img2pdf.convert(input_file)
if pdf_bytes:
pdf_file.write(pdf_bytes)
else:
raise ValueError("Empty output")
else:
output = pypandoc.convert_file(input_file, 'pdf', outputfile=output_file, extra_args=['--pdf-engine', pdflatex_path, '--quiet'])
except Exception as e:
print(f"Error converting {input_file} to PDF: {str(e)}")
return

def convert_to_jpeg(input_file, input_extension, output_folder):
content = get_content(input_file, input_extension)

if content is None:
print("Unable to read file content.")
sys.exit()
print(f"Converted {input_file} to {output_file}")

if input_extension in ['.pdf', '.PDF']:
images = convert_from_bytes(content)
for i, image in enumerate(images):
image.save(os.path.join(output_folder, f'page_{i+1}.jpg'), 'JPEG')
else:
print("Unsupported file type.")
sys.exit()
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python main.py <input_file>")
sys.exit(1)

def convert_to_html(input_file, input_extension, output_file):
content = get_content(input_file, input_extension)
input_path = sys.argv[1]

if content is None:
print("Unable to read file content.")
sys.exit()

if input_extension in ['.pdf', '.PDF']:
images = convert_from_bytes(content)
with open(output_file, 'w') as f:
f.write('<!DOCTYPE html><html><head><title>PDF to HTML</title></head><body>')
for i, image in enumerate(images):
buffer = BytesIO()
image.save(buffer, 'JPEG')
img_base64 = base64.b64encode(buffer.getvalue()).decode()
f.write(f'<img src="data:image/jpeg;base64,{img_base64}" alt="Page {i+1}" width="100%" /><br><br>')
f.write('</body></html>')
elif input_extension in ['.docx', '.DOCX']:
doc_text = docx2txt.process(input_file)
html_text = html2text.html2text(doc_text)
with open(output_file, 'w') as f:
f.write(html_text)
if is_url(input_path):
url_filename = os.path.basename(urlparse(input_path).path)
input_file = f"temp_{url_filename}"
download_file(input_path, input_file)
else:
print("Unsupported file type.")
sys.exit()

if __name__ == '__main__':
if len(sys.argv) < 4:
print("Usage: python main.py <input_file> <output_file> <conversion_type>")
sys.exit()

input_file = sys.argv[1]
output_file = sys.argv[2]
conversion_type = sys.argv[3]
input_file = input_path

if not os.path.exists(input_file):
print(f"Input file '{input_file}' does not exist.")
sys.exit()
sys.exit(1)

input_extension = os.path.splitext(input_file)[1]
input_filename, input_extension = os.path.splitext(input_file)
output_file = f"{input_filename}_output.pdf"

if conversion_type == 'pdf':
convert_to_pdf(input_file, input_extension, output_file)
elif conversion_type == 'jpeg':
output_folder = output_file
if not os.path.exists(output_folder):
os.makedirs(output_folder)
convert_to_jpeg(input_file, input_extension, output_folder)
elif conversion_type == 'html':
convert_to_html(input_file, input_extension, output_file)
else:
print(f"Unsupported conversion type: {conversion_type}")
sys.exit()
convert_to_pdf(input_file, input_extension, output_file)

if is_url(input_path):
os.remove(input_file) # Clean up the temporary file
13 changes: 13 additions & 0 deletions mfe_parse.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
2023-05-03 20:14:30,457 - INFO - Processing .\sample_documents\test.html...
2023-05-03 20:14:33,649 - INFO - Converted .\sample_documents\test.html to .\sample_documents\test_output.pdf
2023-05-03 20:14:38,454 - INFO - Processing .\sample_documents\test.odt...
2023-05-03 20:14:41,332 - INFO - Converted .\sample_documents\test.odt to .\sample_documents\test_output.pdf
2023-05-03 20:15:23,064 - ERROR - Input file '.\sample_documents\test.jpg' does not exist.
2023-05-03 20:15:41,300 - INFO - Processing .\sample_documents\photomode_03122021_224226.png...
2023-05-03 20:15:41,380 - WARNING - Image contains an alpha channel. Computing a separate soft mask (/SMask) image to store transparency in PDF.
2023-05-03 20:15:42,388 - INFO - Converted .\sample_documents\photomode_03122021_224226.png to .\sample_documents\photomode_03122021_224226_output.pdf
2023-05-03 20:16:31,434 - INFO - Processing .\sample_documents\photomode_03122021_224226.png...
2023-05-03 20:16:31,520 - WARNING - Image contains an alpha channel. Computing a separate soft mask (/SMask) image to store transparency in PDF.
2023-05-03 20:16:32,522 - INFO - Converted .\sample_documents\photomode_03122021_224226.png to .\sample_documents\photomode_03122021_224226_output.pdf
2023-05-03 20:16:52,102 - INFO - Processing .\sample_documents\example.txt...
2023-05-03 20:16:52,235 - ERROR - Error converting .\sample_documents\example.txt to PDF: Invalid input format! Got "txt" but expected one of these: biblatex, bibtex, commonmark, commonmark_x, creole, csljson, csv, docbook, docx, dokuwiki, endnotexml, epub, fb2, gfm, haddock, html, ipynb, jats, jira, json, latex, man, markdown, markdown_github, markdown_mmd, markdown_phpextra, markdown_strict, mediawiki, muse, native, odt, opml, org, ris, rst, rtf, t2t, textile, tikiwiki, tsv, twiki, vimwiki
8 changes: 4 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
PyMuPDF
img2pdf
pdf2image
html2text
docx2txt
pillow
fitz
pypandoc
requests
Binary file added sample_documents/blackbuck.bmp
Binary file not shown.
45 changes: 45 additions & 0 deletions sample_documents/example.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
March 14th, 2023
CNN PRIMETIME: FLIGHT RISK – TURBULENT TIMES FOR AIR TRAVEL
HOSTED BY CNN ANCHOR KATE BOLDUAN

THURSDAY, MARCH 16 AT 9PM ET ON CNN AND CNN INTERNATIONAL



Following the news events of near plane collisions and dangerous turbulence, CNN will air a one-hour special taking an in-depth look at the recent troubles in America’s aviation industry on Thursday, March 16 at 9pm ET.

CNN anchor Kate Bolduan, along with CNN correspondents Omar Jimenez and Pete Muntean, will give viewers an inside and simulated perspective of what pilots and air traffic controllers, flight attendants and passengers have to contend with during such emergencies. Captain Chesley Sullenberger will be among the pilots, flight attendants and passengers weighing in on the path forward.

CNN Primetime: Flight Risk: Turbulent Times for Air Travel, will stream live for pay TV subscribers via CNN.com and CNN OTT, and mobile apps under “TV Channels,” or CNNgo where available. CNN Primetime is also available On Demand beginning March 17 to pay TV subscribers via CNN.com, CNN apps, and Cable Operator Platforms.
###

About CNN Worldwide
CNN Worldwide is the most honored brand in cable news, reaching more individuals on television and online than any other cable news organization in the United States. Globally, CNN International is the most widely distributed news channel. CNN Digital is the #1 online news destination, with more unique visitors and video viewers than any other news source. CNN’s award-winning portfolio also includes CNN Original Series, which develops non-scripted programming for television via commissioned projects, acquisitions and in-house production. CNN Films produces, commissions and acquires documentary feature and short films for theatrical and festival exhibition, as well as for broadcast and other distribution across CNN’s multiple platforms. Additionally, CNN Newsource is the world’s most extensively utilized news service partnering with over 1,000 local and international news organizations around the world. CNN is a division of Warner Bros. Discovery.


Examples of 'extensive' in a sentence
Go to the dictionary page of extensive
Examples from Collins dictionaries
When built, the palace and its grounds were more extensive than the city itself.
There was extensive coverage of World Book Day on the BBC.
The facilities available are very extensive.
The blast caused extensive damage, shattering the ground-floor windows.
The security forces have extensive powers of search and arrest.
Mr Marr makes extensive use of exclusively Scottish words.


“A Dog’s Tale” by Mark Twain
A dog recounts her history. Her mother was a collie that liked making a show of her education, which was superficial. She liked using words and phrases she had heard without regard for the meaning. She was also kind, gentle and brave. When the narrator grows up, she’s sold, which is very sad for them both. She ends up in a fine home. One day, a situation arises that tests her character.

Read “A Dog’s Tale”

“The Voice in the Night” by William Hope Hodgson
A schooner is approached by a small rowboat. The passenger doesn’t want any lanterns out or any direct contact with the crew. He only asks for some food to be floated out to him. He leaves after getting it, but returns soon after.
Read “The Voice in the Night”

“The Verdict” by Edith Wharton
At the height of his career as a painter, Jack Gisburn married a rich widow, moved to the Riviera and quit painting. The women whom he painted mourned his sudden departure—men and his fellow artists less so. On a trip to the Riviera three years later, it occurs to the narrator that he could check in on Gisburn and perhaps find out what happened.

Read “The Verdict”


Binary file added sample_documents/locallabtester.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added sample_documents/photomode_03122021_224226.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading