Daethyra · Daethyra · May 4, 2023 · May 4, 2023
diff --git a/LICENSE b/LICENSE
diff --git a/credits.txt b/credits.txt
@@ -0,0 +1 @@
+https://github.com/learning-zone/website-templates/tree/master/coffee-shop-free-html5-template
diff --git a/main.py b/main.py
@@ -1,111 +1,72 @@
-import sys
 import os
+import sys
+import shutil
+import pypandoc
+import img2pdf
+from pdf2image.pdf2image import convert_from_path
 import fitz
-from pdf2image import convert_from_path, convert_from_bytes # type: ignore
-from io import BytesIO
-from PIL import Image
-import html2text
-import docx2txt
-import base64
+import requests
+from urllib.parse import urlparse
 
-def get_content(input_file, input_extension):
-    if input_extension in ['.pdf', '.PDF']:
-        with open(input_file, 'rb') as f:
-            content = f.read()
-    else:
-        content = None
-    return content
+pdflatex_path = r"C:\Users\dae\AppData\Local\Programs\MiKTeX\miktex\bin\x64\pdflatex.exe"
+
+def is_url(input_path):
+    try:
+        result = urlparse(input_path)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
+
+def download_file(url, local_path):
+    with requests.get(url, stream=True) as r:
+        r.raise_for_status()
+        with open(local_path, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
 
 def convert_to_pdf(input_file, input_extension, output_file):
-    content = get_content(input_file, input_extension)
-
-    if content is None:
-        print("Unable to read file content.")
-        sys.exit()
+    print(f"Processing {input_file}...")
 
-    if input_extension in ['.pdf', '.PDF']:
-        with open(output_file, 'wb') as f:
-            f.write(content)
-    elif input_extension in ['.jpeg', '.jpg', '.JPEG', '.JPG']:
-        images = convert_from_bytes(content)
-        images[0].save(output_file, 'PDF', resolution=100.0, save_all=True, append_images=images[1:])
-    elif input_extension in ['.docx', '.DOCX']:
-        doc_text = docx2txt.process(input_file)
-        with fitz.open() as pdf_doc: # type: ignore
-            pdf_doc.new_page()
-            page = pdf_doc[-1]
-            page.insert_textbox(fitz.Rect(72, 72, 522, 720), doc_text)
-            pdf_doc.save(output_file)
-    else:
-        print("Unsupported file type.")
-        sys.exit()
+    try:
+        if input_extension in ['.pdf', '.PDF']:
+            shutil.copy(input_file, output_file)
+        elif input_extension.lower() in ['.jpg', '.jpeg', '.png', '.bmp']:
+            with open(output_file, "wb") as pdf_file:
+                pdf_bytes = img2pdf.convert(input_file)
+                if pdf_bytes:
+                    pdf_file.write(pdf_bytes)
+                else:
+                    raise ValueError("Empty output")
+        else:
+            output = pypandoc.convert_file(input_file, 'pdf', outputfile=output_file, extra_args=['--pdf-engine', pdflatex_path, '--quiet'])
+    except Exception as e:
+        print(f"Error converting {input_file} to PDF: {str(e)}")
+        return
 
-def convert_to_jpeg(input_file, input_extension, output_folder):
-    content = get_content(input_file, input_extension)
-
-    if content is None:
-        print("Unable to read file content.")
-        sys.exit()
+    print(f"Converted {input_file} to {output_file}")
 
-    if input_extension in ['.pdf', '.PDF']:
-        images = convert_from_bytes(content)
-        for i, image in enumerate(images):
-            image.save(os.path.join(output_folder, f'page_{i+1}.jpg'), 'JPEG')
-    else:
-        print("Unsupported file type.")
-        sys.exit()
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python main.py <input_file>")
+        sys.exit(1)
 
-def convert_to_html(input_file, input_extension, output_file):
-    content = get_content(input_file, input_extension)
+    input_path = sys.argv[1]
 
-    if content is None:
-        print("Unable to read file content.")
-        sys.exit()
-
-    if input_extension in ['.pdf', '.PDF']:
-        images = convert_from_bytes(content)
-        with open(output_file, 'w') as f:
-            f.write('<!DOCTYPE html><html><head><title>PDF to HTML</title></head><body>')
-            for i, image in enumerate(images):
-                buffer = BytesIO()
-                image.save(buffer, 'JPEG')
-                img_base64 = base64.b64encode(buffer.getvalue()).decode()
-                f.write(f'<img src="data:image/jpeg;base64,{img_base64}" alt="Page {i+1}" width="100%" /><br><br>')
-            f.write('</body></html>')
-    elif input_extension in ['.docx', '.DOCX']:
-        doc_text = docx2txt.process(input_file)
-        html_text = html2text.html2text(doc_text)
-        with open(output_file, 'w') as f:
-            f.write(html_text)
+    if is_url(input_path):
+        url_filename = os.path.basename(urlparse(input_path).path)
+        input_file = f"temp_{url_filename}"
+        download_file(input_path, input_file)
     else:
-        print("Unsupported file type.")
-        sys.exit()
-
-if __name__ == '__main__':
-    if len(sys.argv) < 4:
-        print("Usage: python main.py <input_file> <output_file> <conversion_type>")
-        sys.exit()
-
-    input_file = sys.argv[1]
-    output_file = sys.argv[2]
-    conversion_type = sys.argv[3]
+        input_file = input_path
 
     if not os.path.exists(input_file):
         print(f"Input file '{input_file}' does not exist.")
-        sys.exit()
+        sys.exit(1)
 
-    input_extension = os.path.splitext(input_file)[1]
+    input_filename, input_extension = os.path.splitext(input_file)
+    output_file = f"{input_filename}_output.pdf"
 
-    if conversion_type == 'pdf':
-        convert_to_pdf(input_file, input_extension, output_file)
-    elif conversion_type == 'jpeg':
-        output_folder = output_file
-        if not os.path.exists(output_folder):
-            os.makedirs(output_folder)
-        convert_to_jpeg(input_file, input_extension, output_folder)
-    elif conversion_type == 'html':
-        convert_to_html(input_file, input_extension, output_file)
-    else:
-        print(f"Unsupported conversion type: {conversion_type}")
-        sys.exit()
+    convert_to_pdf(input_file, input_extension, output_file)
 
+    if is_url(input_path):
+        os.remove(input_file)  # Clean up the temporary file
diff --git a/mfe_parse.log b/mfe_parse.log
@@ -0,0 +1,13 @@
+2023-05-03 20:14:30,457 - INFO - Processing .\sample_documents\test.html...
+2023-05-03 20:14:33,649 - INFO - Converted .\sample_documents\test.html to .\sample_documents\test_output.pdf
+2023-05-03 20:14:38,454 - INFO - Processing .\sample_documents\test.odt...
+2023-05-03 20:14:41,332 - INFO - Converted .\sample_documents\test.odt to .\sample_documents\test_output.pdf
+2023-05-03 20:15:23,064 - ERROR - Input file '.\sample_documents\test.jpg' does not exist.
+2023-05-03 20:15:41,300 - INFO - Processing .\sample_documents\photomode_03122021_224226.png...
+2023-05-03 20:15:41,380 - WARNING - Image contains an alpha channel. Computing a separate soft mask (/SMask) image to store transparency in PDF.
+2023-05-03 20:15:42,388 - INFO - Converted .\sample_documents\photomode_03122021_224226.png to .\sample_documents\photomode_03122021_224226_output.pdf
+2023-05-03 20:16:31,434 - INFO - Processing .\sample_documents\photomode_03122021_224226.png...
+2023-05-03 20:16:31,520 - WARNING - Image contains an alpha channel. Computing a separate soft mask (/SMask) image to store transparency in PDF.
+2023-05-03 20:16:32,522 - INFO - Converted .\sample_documents\photomode_03122021_224226.png to .\sample_documents\photomode_03122021_224226_output.pdf
+2023-05-03 20:16:52,102 - INFO - Processing .\sample_documents\example.txt...
+2023-05-03 20:16:52,235 - ERROR - Error converting .\sample_documents\example.txt to PDF: Invalid input format! Got "txt" but expected one of these: biblatex, bibtex, commonmark, commonmark_x, creole, csljson, csv, docbook, docx, dokuwiki, endnotexml, epub, fb2, gfm, haddock, html, ipynb, jats, jira, json, latex, man, markdown, markdown_github, markdown_mmd, markdown_phpextra, markdown_strict, mediawiki, muse, native, odt, opml, org, ris, rst, rtf, t2t, textile, tikiwiki, tsv, twiki, vimwiki
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
-PyMuPDF
+img2pdf
 pdf2image
-html2text
-docx2txt
-pillow
+fitz
+pypandoc
+requests
diff --git a/sample_documents/blackbuck.bmp b/sample_documents/blackbuck.bmp
diff --git a/sample_documents/example.txt b/sample_documents/example.txt
@@ -0,0 +1,45 @@
+March 14th, 2023
+CNN PRIMETIME: FLIGHT RISK – TURBULENT TIMES FOR AIR TRAVEL
+HOSTED BY CNN ANCHOR KATE BOLDUAN
+
+THURSDAY, MARCH 16 AT 9PM ET ON CNN AND CNN INTERNATIONAL
+
+
+
+Following the news events of near plane collisions and dangerous turbulence, CNN will air a one-hour special taking an in-depth look at the recent troubles in America’s aviation industry on Thursday, March 16 at 9pm ET.
+
+CNN anchor Kate Bolduan, along with CNN correspondents Omar Jimenez and Pete Muntean, will give viewers an inside and simulated perspective of what pilots and air traffic controllers, flight attendants and passengers have to contend with during such emergencies. Captain Chesley Sullenberger will be among the pilots, flight attendants and passengers weighing in on the path forward.
+
+CNN Primetime: Flight Risk: Turbulent Times for Air Travel, will stream live for pay TV subscribers via CNN.com and CNN OTT, and mobile apps under “TV Channels,” or CNNgo where available. CNN Primetime is also available On Demand beginning March 17 to pay TV subscribers via CNN.com, CNN apps, and Cable Operator Platforms.
+###
+
+About CNN Worldwide
+CNN Worldwide is the most honored brand in cable news, reaching more individuals on television and online than any other cable news organization in the United States. Globally, CNN International is the most widely distributed news channel. CNN Digital is the #1 online news destination, with more unique visitors and video viewers than any other news source. CNN’s award-winning portfolio also includes CNN Original Series, which develops non-scripted programming for television via commissioned projects, acquisitions and in-house production. CNN Films produces, commissions and acquires documentary feature and short films for theatrical and festival exhibition, as well as for broadcast and other distribution across CNN’s multiple platforms. Additionally, CNN Newsource is the world’s most extensively utilized news service partnering with over 1,000 local and international news organizations around the world. CNN is a division of Warner Bros. Discovery.
+
+
+Examples of 'extensive' in a sentence
+Go to the dictionary page of extensive
+Examples from Collins dictionaries
+When built, the palace and its grounds were more extensive than the city itself.
+There was extensive coverage of World Book Day on the BBC.
+The facilities available are very extensive.
+The blast caused extensive damage, shattering the ground-floor windows.
+The security forces have extensive powers of search and arrest.
+Mr Marr makes extensive use of exclusively Scottish words.
+
+
+“A Dog’s Tale” by Mark Twain
+A dog recounts her history. Her mother was a collie that liked making a show of her education, which was superficial. She liked using words and phrases she had heard without regard for the meaning. She was also kind, gentle and brave. When the narrator grows up, she’s sold, which is very sad for them both. She ends up in a fine home. One day, a situation arises that tests her character.
+
+Read “A Dog’s Tale”
+
+“The Voice in the Night” by William Hope Hodgson
+A schooner is approached by a small rowboat. The passenger doesn’t want any lanterns out or any direct contact with the crew. He only asks for some food to be floated out to him. He leaves after getting it, but returns soon after.
+Read “The Voice in the Night”
+
+“The Verdict” by Edith Wharton
+At the height of his career as a painter, Jack Gisburn married a rich widow, moved to the Riviera and quit painting. The women whom he painted mourned his sudden departure—men and his fellow artists less so. On a trip to the Riviera three years later, it occurs to the narrator that he could check in on Gisburn and perhaps find out what happened.
+
+Read “The Verdict”
+
+
diff --git a/sample_documents/locallabtester.jpg b/sample_documents/locallabtester.jpg
diff --git a/sample_documents/photomode_03122021_224226.png b/sample_documents/photomode_03122021_224226.png
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		https://github.com/learning-zone/website-templates/tree/master/coffee-shop-free-html5-template