Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Table of contents #61

Merged
merged 4 commits into from
Nov 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
288 changes: 288 additions & 0 deletions examples/Business Plan Writing 101 Wharton Entrepreneurship Series.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ dependencies = [
"faster-whisper>=1.0.3",
"ffmpeg-python>=0.2.0",
"instructor>=1.5.1",
"jinja2>=3.1.4",
"openai>=1.51.0",
"pathvalidate>=3.2.1",
"pydantic>=2.9.1",
"python-slugify>=8.0.4",
"torch>=2.4.1",
"tqdm>=4.66.5",
"typer-slim>=0.12.5",
Expand Down
6 changes: 6 additions & 0 deletions src/yt2doc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ def main(
"--timestamp-paragraphs",
help="Prepend timestamp to paragraphs",
),
add_table_of_contents: bool = typer.Option(
False,
"--add-table-of-contents",
help="Add table of contents at the beginning of the document",
),
skip_cache: typing.Annotated[
bool,
typer.Option("--skip-cache", help="If should skip reading from cache"),
Expand Down Expand Up @@ -151,6 +156,7 @@ def main(
segment_unchaptered=segment_unchaptered,
ignore_source_chapters=ignore_source_chapters,
to_timestamp_paragraphs=to_timestamp_paragraphs,
add_table_of_contents=add_table_of_contents,
llm_model=llm_model,
llm_server=llm_server,
llm_api_key=llm_api_key,
Expand Down
3 changes: 3 additions & 0 deletions src/yt2doc/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def get_yt2doc(
segment_unchaptered: bool,
ignore_source_chapters: bool,
to_timestamp_paragraphs: bool,
add_table_of_contents: bool,
llm_model: typing.Optional[str],
llm_server: str,
llm_api_key: str,
Expand Down Expand Up @@ -63,12 +64,14 @@ def get_yt2doc(
formatter = MarkdownFormatter(
paragraphs_segmenter=paragraphs_segmenter,
to_timestamp_paragraphs=to_timestamp_paragraphs,
add_table_of_contents=add_table_of_contents,
topic_segmenter=llm_topic_segmenter,
)
else:
formatter = MarkdownFormatter(
paragraphs_segmenter=paragraphs_segmenter,
to_timestamp_paragraphs=to_timestamp_paragraphs,
add_table_of_contents=add_table_of_contents,
)

media_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)
Expand Down
138 changes: 89 additions & 49 deletions src/yt2doc/formatting/formatter.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,110 @@
import typing
import logging

import jinja2

from datetime import timedelta
from pathlib import Path

from pydantic import BaseModel
from slugify import slugify

from yt2doc.extraction import interfaces as extraction_interfaces
from yt2doc.formatting import interfaces

# Module-level logger. It is named after the module's import path (__name__)
# rather than its file path (__file__) so that it joins the dotted logger
# hierarchy and inherits configuration applied to the parent package logger.
logger = logging.getLogger(__name__)


# Render-ready view of a single transcript paragraph. Instances are nested
# inside ChapterToRender and dumped to plain dicts before being passed to the
# Markdown template.
class ParagraphToRender(BaseModel):
# Display form of the paragraph's start time, as produced by
# _start_second_to_start_h_m_s: an "H:MM:SS" string, wrapped in a Markdown
# link to the video position when the source domain is youtube.com.
start_h_m_s: str
# Paragraph body: the paragraph's sentence texts concatenated and stripped of
# surrounding whitespace.
text: str


# Render-ready view of one chapter. _render builds one instance per chapter
# that has at least one paragraph, then dumps it to a dict for the template.
class ChapterToRender(BaseModel):
# Chapter heading text, copied from the source chapter's title.
title: str
# slugify(title); presumably the anchor id that table-of-contents entries
# link to -- TODO confirm against template.md.
custom_id: str
# Start time of the chapter's first paragraph, in the same display form as
# ParagraphToRender.start_h_m_s.
start_h_m_s: str
# The chapter's paragraphs in their original order.
paragraphs: typing.Sequence[ParagraphToRender]


class MarkdownFormatter:
# Store the formatter's collaborators and rendering options.
#
# paragraphs_segmenter: groups transcription segments into paragraphs.
# to_timestamp_paragraphs: rendering flag forwarded to the Markdown template.
# add_table_of_contents: rendering flag forwarded to the Markdown template.
# topic_segmenter: optional; when given, format_chaptered_transcript can use
#   it to derive chapters for transcripts that were not chaptered at source.
def __init__(
self,
paragraphs_segmenter: interfaces.IParagraphsSegmenter,
to_timestamp_paragraphs: bool,
add_table_of_contents: bool,
topic_segmenter: typing.Optional[interfaces.ITopicSegmenter] = None,
) -> None:
self.paragraphs_segmenter = paragraphs_segmenter
self.topic_segmenter = topic_segmenter
# str.format templates for Markdown level-1 (video) and level-2 (chapter)
# headings; "{name}" is the placeholder for the title.
self.video_title_template = "# {name}"
self.chapter_title_template = "## {name}"
self.to_timestamp_paragraphs = to_timestamp_paragraphs
self.add_table_of_contents = add_table_of_contents

@staticmethod
def _paragraphs_to_text(
paragraphs: typing.Sequence[typing.Sequence[interfaces.Sentence]],
# Format a start offset (in seconds) as a display timestamp.
#
# The offset is rounded to a whole second and rendered with str(timedelta),
# e.g. 65.4 -> "0:01:05". When webpage_url_domain is exactly "youtube.com" the
# timestamp is returned as a Markdown link to https://youtu.be/<video_id> with
# ?t=<rounded seconds>; for any other domain the bare timestamp is returned.
#
# NOTE(review): the function takes no self; the @staticmethod decorator shown
# just above in this diff presumably applies to it -- confirm in the real file.
def _start_second_to_start_h_m_s(
start_second: float, webpage_url_domain: str, video_id: str
) -> str:
# round() yields an int, so both the label and the ?t= query use whole seconds.
rounded_start_second = round(start_second)
start_h_m_s = str(timedelta(seconds=rounded_start_second))
if webpage_url_domain == "youtube.com":
return (
f"[{start_h_m_s}](https://youtu.be/{video_id}?t={rounded_start_second})"
)
return start_h_m_s

def _render(
self,
title: str,
chapters: typing.Sequence[interfaces.Chapter],
video_url: str,
video_id: str,
webpage_url_domain: str,
to_timestamp_paragraphs: bool,
) -> str:
paragraph_texts = []
for paragraph in paragraphs:
first_sentence = paragraph[0]
paragraph_text = "".join(sentence.text for sentence in paragraph)
paragraph_text = paragraph_text.strip()
if to_timestamp_paragraphs:
paragraph_start_second = round(first_sentence.start_second)
paragraph_start_h_m_s = str(timedelta(seconds=paragraph_start_second))
if webpage_url_domain == "youtube.com":
timestamp_prefix = f"[({paragraph_start_h_m_s})](https://youtu.be/{video_id}?t={paragraph_start_second})"
else:
timestamp_prefix = f"({paragraph_start_h_m_s})"
paragraph_text = f"{timestamp_prefix} {paragraph_text}"
paragraph_texts.append(paragraph_text)
return "\n\n".join(paragraph_texts)
chapters_to_render: typing.List[ChapterToRender] = []
for chapter in chapters:
if len(chapter.paragraphs) == 0:
continue

paragraphs_to_render = [
ParagraphToRender(
text=("".join([sentence.text for sentence in paragraph])).strip(),
start_h_m_s=self._start_second_to_start_h_m_s(
start_second=paragraph[0].start_second,
webpage_url_domain=webpage_url_domain,
video_id=video_id,
),
)
for paragraph in chapter.paragraphs
]
first_paragraph_to_render = paragraphs_to_render[0]
chapters_to_render.append(
ChapterToRender(
title=chapter.title,
custom_id=slugify(chapter.title),
start_h_m_s=first_paragraph_to_render.start_h_m_s,
paragraphs=paragraphs_to_render,
)
)

current_dir = Path(__file__).parent
jinja_environment = jinja2.Environment(
loader=jinja2.FileSystemLoader(current_dir)
)
template = jinja_environment.get_template("template.md")
rendered = template.render(
title=title,
chapters=[chapter.model_dump() for chapter in chapters_to_render],
video_url=video_url,
add_table_of_contents=self.add_table_of_contents,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
)
return rendered

def format_chaptered_transcript(
self, chaptered_transcript: extraction_interfaces.ChapteredTranscript
) -> interfaces.FormattedTranscript:
chapter_and_text_list: typing.List[typing.Tuple[str, str]] = []

if (
self.topic_segmenter is not None
and not chaptered_transcript.chaptered_at_source
Expand All @@ -62,42 +117,27 @@ def format_chaptered_transcript(
chapters = self.topic_segmenter.segment(
sentences_in_paragraphs=paragraphed_sentences
)
chapter_and_text_list = [
(
chapter.title,
self._paragraphs_to_text(
paragraphs=chapter.paragraphs,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
),
)
for chapter in chapters
]

else:
for chapter in chaptered_transcript.chapters:
paragraphed_sentences = self.paragraphs_segmenter.segment(
transcription_segments=chapter.segments
chapters = [
interfaces.Chapter(
title=chapter.title,
paragraphs=self.paragraphs_segmenter.segment(chapter.segments),
)
chapter_full_text = self._paragraphs_to_text(
paragraphs=paragraphed_sentences,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
)
chapter_and_text_list.append((chapter.title, chapter_full_text.strip()))

transcript_text = "\n\n".join(
[
f"{self.chapter_title_template.format(name=chapter_title)}\n\n{chapter_text}"
for chapter_title, chapter_text in chapter_and_text_list
for chapter in chaptered_transcript.chapters
]

rendered_transcript = self._render(
title=chaptered_transcript.title,
chapters=chapters,
video_url=chaptered_transcript.url,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
)
transcript_text = f"{self.video_title_template.format(name=chaptered_transcript.title)}\n\n{chaptered_transcript.url}\n\n{transcript_text}"

return interfaces.FormattedTranscript(
title=chaptered_transcript.title,
transcript=transcript_text,
rendered_transcript=rendered_transcript,
)

def format_chaptered_playlist_transcripts(
Expand Down
2 changes: 1 addition & 1 deletion src/yt2doc/formatting/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Chapter(BaseModel):

class FormattedTranscript(BaseModel):
title: str
transcript: str
rendered_transcript: str


class FormattedPlaylist(BaseModel):
Expand Down
112 changes: 64 additions & 48 deletions src/yt2doc/formatting/llm_adapter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import logging
import typing

from instructor import Instructor
from instructor.exceptions import InstructorRetryException
from pydantic import BaseModel, AfterValidator

logger = logging.getLogger(__name__)


class LLMAdapter:
def __init__(self, llm_client: Instructor, llm_model: str) -> None:
Expand Down Expand Up @@ -36,65 +40,77 @@ class Result(BaseModel):
typing.List[int], AfterValidator(validate_paragraph_indexes)
]

result = self.llm_client.chat.completions.create(
model=self.llm_model,
response_model=Result,
messages=[
{
"role": "system",
"content": """
You are a smart assistant who reads paragraphs of text from an audio transcript and
find the paragraphs that significantly change topic from the previous paragraph.
try:
result = self.llm_client.chat.completions.create(
model=self.llm_model,
response_model=Result,
messages=[
{
"role": "system",
"content": """
You are a smart assistant who reads paragraphs of text from an audio transcript and
find the paragraphs that significantly change topic from the previous paragraph.

Make sure only mark paragraphs that talks about a VERY DIFFERENT topic from the previous one.
Make sure only mark paragraphs that talks about a VERY DIFFERENT topic from the previous one.

The response should be an array of the index number of such paragraphs, such as `[1, 3, 5]`
The response should be an array of the index number of such paragraphs, such as `[1, 3, 5]`

If there is no paragraph that changes topic, then return an empty list.
If there is no paragraph that changes topic, then return an empty list.
""",
},
{
"role": "user",
"content": """
{% for paragraph in paragraphs %}
<paragraph {{ loop.index0 }}>
{{ paragraph }}
</ paragraph {{ loop.index0 }}>
{% endfor %}
""",
},
],
context={
"paragraphs": paragraph_texts,
},
{
"role": "user",
"content": """
{% for paragraph in paragraphs %}
<paragraph {{ loop.index0 }}>
{{ paragraph }}
</ paragraph {{ loop.index0 }}>
{% endfor %}
""",
},
],
context={
"paragraphs": paragraph_texts,
},
)
)
except InstructorRetryException as e:
logger.warning(
f"Failed to get topic changing paragraph from the LLM. Exception: {e}"
)
return []
return result.paragraph_indexes

def generate_title_for_paragraphs(
self, paragraphs: typing.List[typing.List[str]]
) -> str:
text = "\n\n".join(["".join(p) for p in paragraphs])
title = self.llm_client.chat.completions.create(
model=self.llm_model,
response_model=str,
messages=[
{
"role": "system",
"content": """
Please generate a short title for the following text.
try:
title = self.llm_client.chat.completions.create(
model=self.llm_model,
response_model=str,
messages=[
{
"role": "system",
"content": """
Please generate a short title for the following text.

Be VERY SUCCINCT. No more than 6 words.
""",
},
{
"role": "user",
"content": """
{{ text }}
""",
Be VERY SUCCINCT. No more than 6 words.
""",
},
{
"role": "user",
"content": """
{{ text }}
""",
},
],
context={
"text": text,
},
],
context={
"text": text,
},
)
)
except InstructorRetryException as e:
logger.warning(
f"Failed to title for topic segment from the LLM. Exception: {e}"
)
return "Failed to generate title"
return title
Loading
Loading