-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclippings_parser.py
83 lines (68 loc) · 2.8 KB
/
clippings_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re
def parse_clippings(file_path):
"""
Parses a Kindle clippings file and returns the data as a pandas DataFrame.
Args:
file_path (str): The path to the Kindle clippings file.
Returns:
pandas.DataFrame: A DataFrame containing the parsed clippings with columns:
- "Book Title": The title of the book.
- "Metadata": The metadata associated with the clipping.
- "Added On": The date and time when the clipping was added.
- "Text": The content of the clipping.
- "Category": The category of the clipping (extracted from metadata).
- "Page": The page number of the clipping (extracted from metadata).
- "Position": The position of the clipping (extracted from metadata).
"""
try:
import pandas as pd
except:
print("[TODO]: Remove pandas.")
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
# Regular expression to capture the clippings
pattern = r"""^(.*?)\n # Título del libro
-\s(.*?)\s\|\sAñadido\s(el\s.*?)\n # Metadatos
(.*?)\n=+$""" # Contenido y separador
# Compile the expression to allow comments and spaces
regex = re.compile(pattern, re.MULTILINE | re.DOTALL | re.VERBOSE)
matches = regex.findall(content)
# Process matches into a structured list
clippings = []
for match in matches:
book_title = match[0].strip()
metadata = match[1].strip()
added_on = match[2].strip()
text = match[3].strip()
clippings.append({
"Book Title": book_title,
"Metadata": metadata,
"Added On": added_on,
"Text": text
})
# Convert the list to a DataFrame
df = pd.DataFrame(clippings)
# Extract categories, pages, and positions
if not df.empty:
df["Category"] = df["Metadata"].apply(extract_category)
df["Page"] = df["Metadata"].apply(extract_page)
df["Position"] = df["Metadata"].apply(extract_position)
# df.to_csv("kindle_bookmarks.csv")
return df
def extract_category(metadata):
"""Extracts the category: Note, Bookmark, or Highlight"""
if "nota" in metadata.lower():
return "Note"
elif "marcador" in metadata.lower():
return "Bookmark"
elif "subrayado" in metadata.lower():
return "Highlight"
return None
def extract_page(metadata):
"""Extracts the page from the metadata if available"""
match = re.search(r"página (\d+)", metadata, re.IGNORECASE)
return match.group(1) if match else None
def extract_position(metadata):
"""Extracts the position from the metadata"""
match = re.search(r"posición ([\d\-]+)", metadata, re.IGNORECASE)
return match.group(1) if match else None