-
Notifications
You must be signed in to change notification settings - Fork 1
/
pdf_reader.py
43 lines (29 loc) · 1020 Bytes
/
pdf_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# pdf_reader.py
from pathlib import Path
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text
def extract_information(pdf_path):
    """Return the document-information metadata of a PDF file.

    Args:
        pdf_path: Path (str or path-like) of the PDF file to inspect.

    Returns:
        Whatever ``PdfReader(...).metadata`` yields for the file (per the
        PyPDF2 documentation, a ``DocumentInformation`` mapping, or ``None``
        when the PDF carries no information dictionary).
    """
    from io import BytesIO

    with open(pdf_path, 'rb') as f:
        # Metadata values may be indirect PDF objects that PyPDF2 only
        # resolves when they are accessed. Hand the reader an in-memory copy
        # so those look-ups still work after the file handle is closed.
        buffer = BytesIO(f.read())
    return PdfReader(buffer).metadata
def words_in_pdf(pdf_path):
    """Count the whitespace-separated words in the text of a PDF.

    The text is obtained with ``extract_text`` and tallied line by line;
    blank or whitespace-only lines contribute zero words.

    Args:
        pdf_path: Path of the PDF file handed to ``extract_text``.

    Returns:
        Total number of words found across all lines of the extracted text.
    """
    content = extract_text(pdf_path)
    # A whitespace-only line splits into an empty list, so it adds 0 to the
    # total without needing to be filtered out first.
    return sum(len(row.split()) for row in content.splitlines())
if __name__ == '__main__':
    # Count the words of every PDF in the 'judgement_pdfs' directory and
    # show how the counts are distributed.
    pdfs_dir = Path('judgement_pdfs')
    words = []
    for pdf_path in pdfs_dir.glob('*.pdf'):
        try:
            pdf_words = words_in_pdf(pdf_path)
            print(f"PDF: {pdf_path} Words: {pdf_words}")
            words.append(pdf_words)
        except Exception as e:
            # Best-effort batch run: report the failing file and carry on
            # with the remaining PDFs.
            print(f"Error processing {pdf_path}: {e}")

    # Plot a histogram of the words in each PDF
    import matplotlib.pyplot as plt
    plt.hist(words)
    # hist() only builds the figure; without show() a script run would exit
    # without ever displaying it.
    plt.show()