shun-liang · shun-liang · Nov 10, 2024 · Nov 7, 2024 · Nov 9, 2024 · Nov 10, 2024
diff --git a/examples/Business Plan Writing 101 Wharton Entrepreneurship Series.md b/examples/Business Plan Writing 101 Wharton Entrepreneurship Series.md
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,9 +9,11 @@ dependencies = [
     "faster-whisper>=1.0.3",
     "ffmpeg-python>=0.2.0",
     "instructor>=1.5.1",
+    "jinja2>=3.1.4",
     "openai>=1.51.0",
     "pathvalidate>=3.2.1",
     "pydantic>=2.9.1",
+    "python-slugify>=8.0.4",
     "torch>=2.4.1",
     "tqdm>=4.66.5",
     "typer-slim>=0.12.5",

diff --git a/src/yt2doc/cli.py b/src/yt2doc/cli.py
@@ -79,6 +79,11 @@ def main(
         "--timestamp-paragraphs",
         help="Prepend timestamp to paragraphs",
     ),
+    add_table_of_contents: bool = typer.Option(
+        False,
+        "--add-table-of-contents",
+        help="Add table of contents at the beginning of the document",
+    ),
     skip_cache: typing.Annotated[
         bool,
         typer.Option("--skip-cache", help="If should skip reading from cache"),
@@ -151,6 +156,7 @@ def main(
             segment_unchaptered=segment_unchaptered,
             ignore_source_chapters=ignore_source_chapters,
             to_timestamp_paragraphs=to_timestamp_paragraphs,
+            add_table_of_contents=add_table_of_contents,
             llm_model=llm_model,
             llm_server=llm_server,
             llm_api_key=llm_api_key,

diff --git a/src/yt2doc/factories.py b/src/yt2doc/factories.py
@@ -33,6 +33,7 @@ def get_yt2doc(
     segment_unchaptered: bool,
     ignore_source_chapters: bool,
     to_timestamp_paragraphs: bool,
+    add_table_of_contents: bool,
     llm_model: typing.Optional[str],
     llm_server: str,
     llm_api_key: str,
@@ -63,12 +64,14 @@ def get_yt2doc(
         formatter = MarkdownFormatter(
             paragraphs_segmenter=paragraphs_segmenter,
             to_timestamp_paragraphs=to_timestamp_paragraphs,
+            add_table_of_contents=add_table_of_contents,
             topic_segmenter=llm_topic_segmenter,
         )
     else:
         formatter = MarkdownFormatter(
             paragraphs_segmenter=paragraphs_segmenter,
             to_timestamp_paragraphs=to_timestamp_paragraphs,
+            add_table_of_contents=add_table_of_contents,
         )
 
     media_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)

diff --git a/src/yt2doc/formatting/formatter.py b/src/yt2doc/formatting/formatter.py
@@ -1,55 +1,110 @@
 import typing
 import logging
 
+import jinja2
+
 from datetime import timedelta
+from pathlib import Path
+
+from pydantic import BaseModel
+from slugify import slugify
 
 from yt2doc.extraction import interfaces as extraction_interfaces
 from yt2doc.formatting import interfaces
 
 logger = logging.getLogger(__file__)
 
 
+class ParagraphToRender(BaseModel):
+    start_h_m_s: str
+    text: str
+
+
+class ChapterToRender(BaseModel):
+    title: str
+    custom_id: str
+    start_h_m_s: str
+    paragraphs: typing.Sequence[ParagraphToRender]
+
+
 class MarkdownFormatter:
     def __init__(
         self,
         paragraphs_segmenter: interfaces.IParagraphsSegmenter,
         to_timestamp_paragraphs: bool,
+        add_table_of_contents: bool,
         topic_segmenter: typing.Optional[interfaces.ITopicSegmenter] = None,
     ) -> None:
         self.paragraphs_segmenter = paragraphs_segmenter
         self.topic_segmenter = topic_segmenter
         self.video_title_template = "# {name}"
         self.chapter_title_template = "## {name}"
         self.to_timestamp_paragraphs = to_timestamp_paragraphs
+        self.add_table_of_contents = add_table_of_contents
 
     @staticmethod
-    def _paragraphs_to_text(
-        paragraphs: typing.Sequence[typing.Sequence[interfaces.Sentence]],
+    def _start_second_to_start_h_m_s(
+        start_second: float, webpage_url_domain: str, video_id: str
+    ) -> str:
+        rounded_start_second = round(start_second)
+        start_h_m_s = str(timedelta(seconds=rounded_start_second))
+        if webpage_url_domain == "youtube.com":
+            return (
+                f"[{start_h_m_s}](https://youtu.be/{video_id}?t={rounded_start_second})"
+            )
+        return start_h_m_s
+
+    def _render(
+        self,
+        title: str,
+        chapters: typing.Sequence[interfaces.Chapter],
+        video_url: str,
         video_id: str,
         webpage_url_domain: str,
-        to_timestamp_paragraphs: bool,
     ) -> str:
-        paragraph_texts = []
-        for paragraph in paragraphs:
-            first_sentence = paragraph[0]
-            paragraph_text = "".join(sentence.text for sentence in paragraph)
-            paragraph_text = paragraph_text.strip()
-            if to_timestamp_paragraphs:
-                paragraph_start_second = round(first_sentence.start_second)
-                paragraph_start_h_m_s = str(timedelta(seconds=paragraph_start_second))
-                if webpage_url_domain == "youtube.com":
-                    timestamp_prefix = f"[({paragraph_start_h_m_s})](https://youtu.be/{video_id}?t={paragraph_start_second})"
-                else:
-                    timestamp_prefix = f"({paragraph_start_h_m_s})"
-                paragraph_text = f"{timestamp_prefix} {paragraph_text}"
-            paragraph_texts.append(paragraph_text)
-        return "\n\n".join(paragraph_texts)
+        chapters_to_render: typing.List[ChapterToRender] = []
+        for chapter in chapters:
+            if len(chapter.paragraphs) == 0:
+                continue
+
+            paragraphs_to_render = [
+                ParagraphToRender(
+                    text=("".join([sentence.text for sentence in paragraph])).strip(),
+                    start_h_m_s=self._start_second_to_start_h_m_s(
+                        start_second=paragraph[0].start_second,
+                        webpage_url_domain=webpage_url_domain,
+                        video_id=video_id,
+                    ),
+                )
+                for paragraph in chapter.paragraphs
+            ]
+            first_paragraph_to_render = paragraphs_to_render[0]
+            chapters_to_render.append(
+                ChapterToRender(
+                    title=chapter.title,
+                    custom_id=slugify(chapter.title),
+                    start_h_m_s=first_paragraph_to_render.start_h_m_s,
+                    paragraphs=paragraphs_to_render,
+                )
+            )
+
+        current_dir = Path(__file__).parent
+        jinja_environment = jinja2.Environment(
+            loader=jinja2.FileSystemLoader(current_dir)
+        )
+        template = jinja_environment.get_template("template.md")
+        rendered = template.render(
+            title=title,
+            chapters=[chapter.model_dump() for chapter in chapters_to_render],
+            video_url=video_url,
+            add_table_of_contents=self.add_table_of_contents,
+            to_timestamp_paragraphs=self.to_timestamp_paragraphs,
+        )
+        return rendered
 
     def format_chaptered_transcript(
         self, chaptered_transcript: extraction_interfaces.ChapteredTranscript
     ) -> interfaces.FormattedTranscript:
-        chapter_and_text_list: typing.List[typing.Tuple[str, str]] = []
-
         if (
             self.topic_segmenter is not None
             and not chaptered_transcript.chaptered_at_source
@@ -62,42 +117,27 @@ def format_chaptered_transcript(
             chapters = self.topic_segmenter.segment(
                 sentences_in_paragraphs=paragraphed_sentences
             )
-            chapter_and_text_list = [
-                (
-                    chapter.title,
-                    self._paragraphs_to_text(
-                        paragraphs=chapter.paragraphs,
-                        video_id=chaptered_transcript.video_id,
-                        webpage_url_domain=chaptered_transcript.webpage_url_domain,
-                        to_timestamp_paragraphs=self.to_timestamp_paragraphs,
-                    ),
-                )
-                for chapter in chapters
-            ]
 
         else:
-            for chapter in chaptered_transcript.chapters:
-                paragraphed_sentences = self.paragraphs_segmenter.segment(
-                    transcription_segments=chapter.segments
+            chapters = [
+                interfaces.Chapter(
+                    title=chapter.title,
+                    paragraphs=self.paragraphs_segmenter.segment(chapter.segments),
                 )
-                chapter_full_text = self._paragraphs_to_text(
-                    paragraphs=paragraphed_sentences,
-                    video_id=chaptered_transcript.video_id,
-                    webpage_url_domain=chaptered_transcript.webpage_url_domain,
-                    to_timestamp_paragraphs=self.to_timestamp_paragraphs,
-                )
-                chapter_and_text_list.append((chapter.title, chapter_full_text.strip()))
-
-        transcript_text = "\n\n".join(
-            [
-                f"{self.chapter_title_template.format(name=chapter_title)}\n\n{chapter_text}"
-                for chapter_title, chapter_text in chapter_and_text_list
+                for chapter in chaptered_transcript.chapters
             ]
+
+        rendered_transcript = self._render(
+            title=chaptered_transcript.title,
+            chapters=chapters,
+            video_url=chaptered_transcript.url,
+            video_id=chaptered_transcript.video_id,
+            webpage_url_domain=chaptered_transcript.webpage_url_domain,
         )
-        transcript_text = f"{self.video_title_template.format(name=chaptered_transcript.title)}\n\n{chaptered_transcript.url}\n\n{transcript_text}"
+
         return interfaces.FormattedTranscript(
             title=chaptered_transcript.title,
-            transcript=transcript_text,
+            rendered_transcript=rendered_transcript,
         )
 
     def format_chaptered_playlist_transcripts(

diff --git a/src/yt2doc/formatting/interfaces.py b/src/yt2doc/formatting/interfaces.py
@@ -18,7 +18,7 @@ class Chapter(BaseModel):
 
 class FormattedTranscript(BaseModel):
     title: str
-    transcript: str
+    rendered_transcript: str
 
 
 class FormattedPlaylist(BaseModel):

diff --git a/src/yt2doc/formatting/llm_adapter.py b/src/yt2doc/formatting/llm_adapter.py
@@ -1,8 +1,12 @@
+import logging
 import typing
 
 from instructor import Instructor
+from instructor.exceptions import InstructorRetryException
 from pydantic import BaseModel, AfterValidator
 
+logger = logging.getLogger(__name__)
+
 
 class LLMAdapter:
     def __init__(self, llm_client: Instructor, llm_model: str) -> None:
@@ -36,65 +40,77 @@ class Result(BaseModel):
                 typing.List[int], AfterValidator(validate_paragraph_indexes)
             ]
 
-        result = self.llm_client.chat.completions.create(
-            model=self.llm_model,
-            response_model=Result,
-            messages=[
-                {
-                    "role": "system",
-                    "content": """
-                        You are a smart assistant who reads paragraphs of text from an audio transcript and
-                        find the paragraphs that significantly change topic from the previous paragraph.
+        try:
+            result = self.llm_client.chat.completions.create(
+                model=self.llm_model,
+                response_model=Result,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": """
+                            You are a smart assistant who reads paragraphs of text from an audio transcript and
+                            find the paragraphs that significantly change topic from the previous paragraph.
 
-                        Make sure only mark paragraphs that talks about a VERY DIFFERENT topic from the previous one.
+                            Make sure only mark paragraphs that talks about a VERY DIFFERENT topic from the previous one.
 
-                        The response should be an array of the index number of such paragraphs, such as `[1, 3, 5]`
+                            The response should be an array of the index number of such paragraphs, such as `[1, 3, 5]`
 
-                        If there is no paragraph that changes topic, then return an empty list.
+                            If there is no paragraph that changes topic, then return an empty list.
+                            """,
+                    },
+                    {
+                        "role": "user",
+                        "content": """
+                            {% for paragraph in paragraphs %}
+                            <paragraph {{ loop.index0 }}>
+                            {{ paragraph }}
+                            </ paragraph {{ loop.index0 }}>
+                            {% endfor %}
                         """,
+                    },
+                ],
+                context={
+                    "paragraphs": paragraph_texts,
                 },
-                {
-                    "role": "user",
-                    "content": """
-                        {% for paragraph in paragraphs %}
-                        <paragraph {{ loop.index0 }}>
-                        {{ paragraph }}
-                        </ paragraph {{ loop.index0 }}>
-                        {% endfor %}
-                    """,
-                },
-            ],
-            context={
-                "paragraphs": paragraph_texts,
-            },
-        )
+            )
+        except InstructorRetryException as e:
+            logger.warning(
+                f"Failed to get topic-changing paragraphs from the LLM. Exception: {e}"
+            )
+            return []
         return result.paragraph_indexes
 
     def generate_title_for_paragraphs(
         self, paragraphs: typing.List[typing.List[str]]
     ) -> str:
         text = "\n\n".join(["".join(p) for p in paragraphs])
-        title = self.llm_client.chat.completions.create(
-            model=self.llm_model,
-            response_model=str,
-            messages=[
-                {
-                    "role": "system",
-                    "content": """
-                        Please generate a short title for the following text.
+        try:
+            title = self.llm_client.chat.completions.create(
+                model=self.llm_model,
+                response_model=str,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": """
+                            Please generate a short title for the following text.
 
-                        Be VERY SUCCINCT. No more than 6 words.
-                    """,
-                },
-                {
-                    "role": "user",
-                    "content": """
-                        {{ text }}
-                    """,
+                            Be VERY SUCCINCT. No more than 6 words.
+                        """,
+                    },
+                    {
+                        "role": "user",
+                        "content": """
+                            {{ text }}
+                        """,
+                    },
+                ],
+                context={
+                    "text": text,
                 },
-            ],
-            context={
-                "text": text,
-            },
-        )
+            )
+        except InstructorRetryException as e:
+            logger.warning(
+                f"Failed to generate title for topic segment from the LLM. Exception: {e}"
+            )
+            return "Failed to generate title"
         return title