Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⌛ Add progress bar and file type distribution #15

Merged
merged 3 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<div align="center">

<img width="80%" src="codebase.svg" alt="codebase.svg"><br>
<img width="75%" src="codebase.svg" alt="codebase.svg"><br>

<p>data consolidation tool.</p>

Expand Down
85 changes: 61 additions & 24 deletions codebase/collector.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import logging
import os
import re
from collections import Counter

import tiktoken
from tqdm import tqdm

from .filter import filter_extensions, read_codebaseignore

Expand All @@ -26,13 +28,33 @@ def escape_markdown_characters(file_name):

def count_lines_of_code(content):
    """
    Count the lines of code inside fenced code blocks in the markdown content.

    Each ```...``` block contributes its line count minus the two fence lines.
    Returns 0 when the content contains no code blocks.
    """
    codeblocks = re.findall(r"```[\s\S]*?```", content)
    # subtract 2 per block for the opening and closing ``` fence lines
    return sum(len(block.split("\n")) - 2 for block in codeblocks)


def get_file_type_distribution(markdown_content):
    """
    Return the distribution of the most common file types in the markdown output.

    Scans the "#### <path>" headings emitted for each consolidated file and
    treats the text after the last "." as the file type.

    Returns a list of up to five (file_type, percentage) tuples: the four most
    common types, plus an aggregated ("other", ...) entry when more than four
    distinct types are present. Returns an empty list when no file headings are
    found, avoiding a ZeroDivisionError on an empty or fully-filtered codebase.
    """
    file_types = [line.split(".")[-1] for line in markdown_content.split("\n") if line.startswith("####")]
    total_files = len(file_types)
    if total_files == 0:
        return []

    type_counter = Counter(file_types)
    most_common_types = type_counter.most_common(4)
    type_distribution = [(file_type, count / total_files * 100) for file_type, count in most_common_types]

    if len(type_counter) > 4:
        # aggregate every type outside the top four into a single "other" bucket
        top_types = dict(most_common_types)
        other_count = sum(count for file_type, count in type_counter.items() if file_type not in top_types)
        type_distribution.append(("other", other_count / total_files * 100))

    return type_distribution


def count_tokens(text):
"""
Encoding for GPT-3.5/GPT-4.0.
Expand All @@ -56,33 +78,48 @@ def consolidate(path, extensions=None):

for root, dirs, files in os.walk(path):
dirs[:] = [d for d in dirs if not exclude_files(os.path.relpath(str(os.path.join(root, d)), path))]
file_count += sum(
1 for file in files if not exclude_files(os.path.relpath(str(os.path.join(root, file)), path))
)

with tqdm(
total=file_count,
unit="file",
ncols=100,
bar_format="▶️ | {desc}: {bar:45} {percentage:3.0f}% | {n_fmt}/{total_fmt}",
) as progress_bar:
for root, dirs, files in os.walk(path):
dirs[:] = [d for d in dirs if not exclude_files(os.path.relpath(str(os.path.join(root, d)), path))]

for file in files:
file_path = os.path.join(root, file)
relative_path = os.path.relpath(str(file_path), path)

if (extensions and not filter_extensions(file_path, extensions)) or exclude_files(relative_path):
continue

for file in files:
file_path = os.path.join(root, file)
relative_path = os.path.relpath(str(file_path), path)

if (extensions and not filter_extensions(file_path, extensions)) or exclude_files(relative_path):
continue
_, file_extension = os.path.splitext(file)
_, file_extension = os.path.splitext(file)

try:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
except UnicodeDecodeError:
try:
with open(file_path, "r", encoding="iso-8859-1") as f:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
except (OSError, IOError) as e:
_logger.warning("Unable to read %s: %s. Skipping this file.", file_path, str(e))
continue

escaped_relative_path = escape_markdown_characters(relative_path)
file_content = f"\n#### {escaped_relative_path}\n\n```{file_extension[1:]}\n{content.rstrip()}\n```\n"
codebase += file_content
file_count += 1
token_count += count_tokens(file_content)
lines_of_code_count += len(content.split("\n"))
except UnicodeDecodeError:
try:
with open(file_path, "r", encoding="iso-8859-1") as f:
content = f.read()
except (OSError, IOError) as e:
_logger.warning("Unable to read %s: %s. Skipping this file.", file_path, str(e))
continue

escaped_relative_path = escape_markdown_characters(relative_path)
file_content = f"\n#### {escaped_relative_path}\n\n```{file_extension[1:]}\n{content.rstrip()}\n```\n"
codebase += file_content
token_count += count_tokens(file_content)
lines_of_code_count += len(content.split("\n"))

progress_bar.update(1)

codebase = remove_trailing_whitespace(codebase)
type_distribution = get_file_type_distribution(codebase)

return codebase, file_count, token_count, lines_of_code_count
return codebase, file_count, token_count, lines_of_code_count, type_distribution
18 changes: 15 additions & 3 deletions codebase/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def path_prompt(message, default, exists=False):
multiple=True,
help="enables optional filtering by extensions, for instance: -f py,json", # markdown contains only .py/.json files
)
# pylint: disable=too-many-locals
def generate_markdown(input_path, output_path, extension_filter):
no_flags_provided = input_path is None and output_path is None and not extension_filter
project_root = get_project_root()
Expand All @@ -94,7 +95,9 @@ def generate_markdown(input_path, output_path, extension_filter):
extensions = parse_extensions(None, None, [extensions_input])

extensions = list(extensions) if extensions else None
markdown_content, file_count, lines_of_code_count, token_count = consolidate(input_path, extensions)
markdown_content, file_count, token_count, lines_of_code_count, type_distribution = consolidate(
input_path, extensions
)

if len(markdown_content.encode("utf-8")) > MAX_FILE_SIZE:
_logger.error(
Expand All @@ -116,27 +119,36 @@ def generate_markdown(input_path, output_path, extension_filter):
else:
file_size = f"{codebase_size / (1024 * 1024):.2f} MB"

file_type_distribution = " ".join(
f".{file_type} ({percentage:.0f}%)" for file_type, percentage in type_distribution
)

_logger.info(
"\n"
+ "🟢 CODEBASE CONSOLIDATED SUCCESSFULLY \n"
+ "🟢 CODEBASE CONSOLIDATED SUCCESSFULLY.\n"
+ "\n"
+ "📁 MARKDOWN FILE LOCATION: %s"
+ "\n"
+ "💾 MARKDOWN FILE SIZE: %s"
+ "\n"
+ "📄 FILES PROCESSED: %d"
+ "\n"
+ "📊 LINES OF CODE: %d"
+ "📊 TYPE DISTRIBUTION: %s"
+ "\n"
+ "📈 LINES OF CODE: %d"
+ "\n"
+ "🪙 TOKEN COUNT: %d"
+ "\n",
codebase,
file_size,
file_count,
file_type_distribution,
lines_of_code_count,
token_count,
)


# to run the script during local development, either execute $ python -m codebase
# or install codebase locally via `pdm install` and simply run $ codebase
if __name__ == "__main__":
generate_markdown.main(standalone_mode=False)
27 changes: 26 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ dependencies = [
"click>=8.1.7",
"prompt_toolkit>=3.0.47",
"tiktoken>=0.7.0",
"tqdm>=4.66.5",
"types-tqdm>=4.66.0",
]
dynamic = ["version"]
readme = "README.md"
Expand Down
4 changes: 3 additions & 1 deletion requirements.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
click
prompt_toolkit
tiktoken
tiktoken
tqdm
types-tqdm
8 changes: 7 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ charset-normalizer==3.3.2
click==8.1.7
# via -r requirements.in
colorama==0.4.6
# via click
# via
# click
# tqdm
idna==3.7
# via requests
prompt-toolkit==3.0.47
Expand All @@ -22,6 +24,10 @@ requests==2.32.3
# via tiktoken
tiktoken==0.7.0
# via -r requirements.in
tqdm==4.66.5
# via -r requirements.in
types-tqdm==4.66.0.20240417
# via -r requirements.in
urllib3==2.2.2
# via requests
wcwidth==0.2.13
Expand Down
4 changes: 2 additions & 2 deletions unittests/test_extension_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
def test_consolidate_only_specified_filters(
project_root, mock_project, mock_operations, mock_codebaseignore
): # pylint: disable=unused-argument
filtered_codebase, _, _, _ = consolidate(project_root, extensions=["md", "txt"])
filtered_codebase, *_ = consolidate(project_root, extensions=["md", "txt"])

assert not any(extension in mock_codebaseignore for extension in [".md", ".txt", ".py", ".yml"])
assert re.search(rf"#### {re.escape(escape_markdown_characters('markdown.md'))}", filtered_codebase)
Expand All @@ -32,7 +32,7 @@ def test_consolidate_only_specified_filters(
def test_extension_filter_bypasses_codebaseignore(
project_root, mock_project, mock_operations, mock_codebaseignore
): # pylint: disable=unused-argument
filtered_codebase, _, _, _ = consolidate(project_root, extensions=["svg"])
filtered_codebase, *_ = consolidate(project_root, extensions=["svg"])

assert ".svg" in mock_codebaseignore
assert re.search(
Expand Down
38 changes: 28 additions & 10 deletions unittests/test_file_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
def test_consolidate_excludes_ignored_files(
project_root, mock_project, mock_operations
): # pylint: disable=unused-argument
codebase, _, _, _ = consolidate(project_root)
codebase, *_ = consolidate(project_root)
codebaseignore = mock_project[os.path.join(project_root, ".codebaseignore")]

assert ".png" in codebaseignore
Expand All @@ -28,7 +28,7 @@ def test_consolidate_excludes_ignored_files(
def test_consolidate_considers_subdirectories(
project_root, mock_project, mock_operations
): # pylint: disable=unused-argument
codebase, _, _, _ = consolidate(project_root)
codebase, *_ = consolidate(project_root)

print(f"Mock project structure: {mock_project}")
print(f"Consolidated codebase:\n{codebase}")
Expand All @@ -49,7 +49,7 @@ def test_consolidate_considers_subdirectories(


def test_consolidate_file_token_count(project_root, mock_project, mock_operations): # pylint: disable=unused-argument
codebase, file_count, token_count, lines_of_code_count = consolidate(project_root)
_, file_count, token_count, *_ = consolidate(project_root)

expected_file_count = len(
[
Expand All @@ -59,21 +59,39 @@ def test_consolidate_file_token_count(project_root, mock_project, mock_operation
]
)

assert file_count == expected_file_count
assert token_count > 0


def test_consolidate_line_of_code_count(project_root, mock_project, mock_operations):  # pylint: disable=unused-argument
    """Lines of code reported by consolidate() must match the mock project's file contents."""
    # consolidate returns (codebase, file_count, token_count, lines_of_code_count, type_distribution);
    # unpack the fourth element — `_, lines_of_code_count, *_` would grab file_count instead
    _, _, _, lines_of_code_count, _ = consolidate(project_root)

    expected_lines_of_code_count = sum(
        len(content.split("\n"))
        for file_path, content in mock_project.items()
        if not file_path.endswith((".codebaseignore", ".png", ".svg"))
    )

    assert lines_of_code_count == expected_lines_of_code_count

for file_path, content in mock_project.items():
if not file_path.endswith((".codebaseignore", ".png", ".svg")):
escaped_path = escape_markdown_characters(os.path.relpath(file_path, project_root))
assert re.search(rf"#### {re.escape(escaped_path)}", codebase)
assert content in codebase

def test_consolidate_file_type_distribution(
    project_root, mock_project, mock_operations
):  # pylint: disable=unused-argument
    """The consolidated markdown must contain one file heading per expected file type."""
    codebase, file_count, *_ = consolidate(project_root)

    # one file of each type exists in the mock project
    expected_types = {
        "py": 1,  # mock_project/python.py
        "md": 1,  # mock_project/markdown.md
        "txt": 1,  # mock_project/text.txt
        "yml": 1,  # mock_project/subdirectory/markup.yml
    }
    expected_total = sum(expected_types.values())

    assert file_count == expected_total

    for expected_type in expected_types:
        heading_pattern = rf"#### .*\.{expected_type.lower()}"
        assert re.search(heading_pattern, codebase, re.IGNORECASE)


def test_consolidate_removes_trailing_whitespace():
Expand Down