Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⌛ Add progress bar and file type distribution #15

Merged
merged 3 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<div align="center">

<img width="80%" src="codebase.svg" alt="codebase.svg"><br>
<img width="75%" src="codebase.svg" alt="codebase.svg"><br>

<p>data consolidation tool.</p>

Expand Down
85 changes: 61 additions & 24 deletions codebase/collector.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import logging
import os
import re
from collections import Counter

import tiktoken
from tqdm import tqdm

from .filter import filter_extensions, read_codebaseignore

Expand All @@ -26,13 +28,33 @@ def escape_markdown_characters(file_name):

def count_lines_of_code(content):
    """
    Count the lines of code inside fenced code blocks in the markdown content.

    Each ```...``` block contributes its line count minus the two fence lines.
    Returns 0 when the content contains no code blocks.
    """
    codeblocks = re.findall(r"```[\s\S]*?```", content)
    # subtract 2 per block for the opening and closing ``` fence lines
    return sum(len(block.split("\n")) - 2 for block in codeblocks)


def get_file_type_distribution(markdown_content):
    """
    Return the distribution of the most common file types in the markdown output.

    Scans the "#### <path>" headings emitted for each consolidated file and
    treats the text after the last "." as the file type.

    Returns a list of up to five (file_type, percentage) tuples: the four most
    common types, plus an aggregated ("other", ...) entry when more than four
    distinct types are present. Returns an empty list when no file headings are
    found, avoiding a ZeroDivisionError on an empty or fully-filtered codebase.
    """
    file_types = [line.split(".")[-1] for line in markdown_content.split("\n") if line.startswith("####")]
    total_files = len(file_types)
    if total_files == 0:
        return []

    type_counter = Counter(file_types)
    most_common_types = type_counter.most_common(4)
    type_distribution = [(file_type, count / total_files * 100) for file_type, count in most_common_types]

    if len(type_counter) > 4:
        # aggregate every type outside the top four into a single "other" bucket
        top_types = dict(most_common_types)
        other_count = sum(count for file_type, count in type_counter.items() if file_type not in top_types)
        type_distribution.append(("other", other_count / total_files * 100))

    return type_distribution


def count_tokens(text):
"""
Encoding for GPT-3.5/GPT-4.0.
Expand All @@ -56,33 +78,48 @@ def consolidate(path, extensions=None):

for root, dirs, files in os.walk(path):
dirs[:] = [d for d in dirs if not exclude_files(os.path.relpath(str(os.path.join(root, d)), path))]
file_count += sum(
1 for file in files if not exclude_files(os.path.relpath(str(os.path.join(root, file)), path))
)

with tqdm(
total=file_count,
unit="file",
ncols=100,
bar_format="▶️ | {desc}: {bar:45} {percentage:3.0f}% | {n_fmt}/{total_fmt}",
) as progress_bar:
for root, dirs, files in os.walk(path):
dirs[:] = [d for d in dirs if not exclude_files(os.path.relpath(str(os.path.join(root, d)), path))]

for file in files:
file_path = os.path.join(root, file)
relative_path = os.path.relpath(str(file_path), path)

if (extensions and not filter_extensions(file_path, extensions)) or exclude_files(relative_path):
continue

for file in files:
file_path = os.path.join(root, file)
relative_path = os.path.relpath(str(file_path), path)

if (extensions and not filter_extensions(file_path, extensions)) or exclude_files(relative_path):
continue
_, file_extension = os.path.splitext(file)
_, file_extension = os.path.splitext(file)

try:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
except UnicodeDecodeError:
try:
with open(file_path, "r", encoding="iso-8859-1") as f:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
except (OSError, IOError) as e:
_logger.warning("Unable to read %s: %s. Skipping this file.", file_path, str(e))
continue

escaped_relative_path = escape_markdown_characters(relative_path)
file_content = f"\n#### {escaped_relative_path}\n\n```{file_extension[1:]}\n{content.rstrip()}\n```\n"
codebase += file_content
file_count += 1
token_count += count_tokens(file_content)
lines_of_code_count += len(content.split("\n"))
except UnicodeDecodeError:
try:
with open(file_path, "r", encoding="iso-8859-1") as f:
content = f.read()
except (OSError, IOError) as e:
_logger.warning("Unable to read %s: %s. Skipping this file.", file_path, str(e))
continue

escaped_relative_path = escape_markdown_characters(relative_path)
file_content = f"\n#### {escaped_relative_path}\n\n```{file_extension[1:]}\n{content.rstrip()}\n```\n"
codebase += file_content
token_count += count_tokens(file_content)
lines_of_code_count += len(content.split("\n"))

progress_bar.update(1)

codebase = remove_trailing_whitespace(codebase)
type_distribution = get_file_type_distribution(codebase)

return codebase, file_count, token_count, lines_of_code_count
return codebase, file_count, token_count, lines_of_code_count, type_distribution
18 changes: 15 additions & 3 deletions codebase/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def path_prompt(message, default, exists=False):
multiple=True,
help="enables optional filtering by extensions, for instance: -f py,json", # markdown contains only .py/.json files
)
# pylint: disable=too-many-locals
def generate_markdown(input_path, output_path, extension_filter):
no_flags_provided = input_path is None and output_path is None and not extension_filter
project_root = get_project_root()
Expand All @@ -94,7 +95,9 @@ def generate_markdown(input_path, output_path, extension_filter):
extensions = parse_extensions(None, None, [extensions_input])

extensions = list(extensions) if extensions else None
markdown_content, file_count, lines_of_code_count, token_count = consolidate(input_path, extensions)
markdown_content, file_count, token_count, lines_of_code_count, type_distribution = consolidate(
input_path, extensions
)

if len(markdown_content.encode("utf-8")) > MAX_FILE_SIZE:
_logger.error(
Expand All @@ -116,27 +119,36 @@ def generate_markdown(input_path, output_path, extension_filter):
else:
file_size = f"{codebase_size / (1024 * 1024):.2f} MB"

file_type_distribution = " ".join(
f".{file_type} ({percentage:.0f}%)" for file_type, percentage in type_distribution
)

_logger.info(
"\n"
+ "🟢 CODEBASE CONSOLIDATED SUCCESSFULLY \n"
+ "🟢 CODEBASE CONSOLIDATED SUCCESSFULLY.\n"
+ "\n"
+ "📁 MARKDOWN FILE LOCATION: %s"
+ "\n"
+ "💾 MARKDOWN FILE SIZE: %s"
+ "\n"
+ "📄 FILES PROCESSED: %d"
+ "\n"
+ "📊 LINES OF CODE: %d"
+ "📊 TYPE DISTRIBUTION: %s"
+ "\n"
+ "📈 LINES OF CODE: %d"
+ "\n"
+ "🪙 TOKEN COUNT: %d"
+ "\n",
codebase,
file_size,
file_count,
file_type_distribution,
lines_of_code_count,
token_count,
)


# to run the script during local development, either execute $ python -m codebase
# or install codebase locally via `pdm install` and simply run $ codebase
if __name__ == "__main__":
generate_markdown.main(standalone_mode=False)
27 changes: 26 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ dependencies = [
"click>=8.1.7",
"prompt_toolkit>=3.0.47",
"tiktoken>=0.7.0",
"tqdm>=4.66.5",
"types-tqdm>=4.66.0",
]
dynamic = ["version"]
readme = "README.md"
Expand Down
4 changes: 3 additions & 1 deletion requirements.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
click
prompt_toolkit
tiktoken
tiktoken
tqdm
types-tqdm
8 changes: 7 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ charset-normalizer==3.3.2
click==8.1.7
# via -r requirements.in
colorama==0.4.6
# via click
# via
# click
# tqdm
idna==3.7
# via requests
prompt-toolkit==3.0.47
Expand All @@ -22,6 +24,10 @@ requests==2.32.3
# via tiktoken
tiktoken==0.7.0
# via -r requirements.in
tqdm==4.66.5
# via -r requirements.in
types-tqdm==4.66.0.20240417
# via -r requirements.in
urllib3==2.2.2
# via requests
wcwidth==0.2.13
Expand Down
4 changes: 2 additions & 2 deletions unittests/test_extension_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
def test_consolidate_only_specified_filters(
project_root, mock_project, mock_operations, mock_codebaseignore
): # pylint: disable=unused-argument
filtered_codebase, _, _, _ = consolidate(project_root, extensions=["md", "txt"])
filtered_codebase, *_ = consolidate(project_root, extensions=["md", "txt"])

assert not any(extension in mock_codebaseignore for extension in [".md", ".txt", ".py", ".yml"])
assert re.search(rf"#### {re.escape(escape_markdown_characters('markdown.md'))}", filtered_codebase)
Expand All @@ -32,7 +32,7 @@ def test_consolidate_only_specified_filters(
def test_extension_filter_bypasses_codebaseignore(
project_root, mock_project, mock_operations, mock_codebaseignore
): # pylint: disable=unused-argument
filtered_codebase, _, _, _ = consolidate(project_root, extensions=["svg"])
filtered_codebase, *_ = consolidate(project_root, extensions=["svg"])

assert ".svg" in mock_codebaseignore
assert re.search(
Expand Down
38 changes: 28 additions & 10 deletions unittests/test_file_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
def test_consolidate_excludes_ignored_files(
project_root, mock_project, mock_operations
): # pylint: disable=unused-argument
codebase, _, _, _ = consolidate(project_root)
codebase, *_ = consolidate(project_root)
codebaseignore = mock_project[os.path.join(project_root, ".codebaseignore")]

assert ".png" in codebaseignore
Expand All @@ -28,7 +28,7 @@ def test_consolidate_excludes_ignored_files(
def test_consolidate_considers_subdirectories(
project_root, mock_project, mock_operations
): # pylint: disable=unused-argument
codebase, _, _, _ = consolidate(project_root)
codebase, *_ = consolidate(project_root)

print(f"Mock project structure: {mock_project}")
print(f"Consolidated codebase:\n{codebase}")
Expand All @@ -49,7 +49,7 @@ def test_consolidate_considers_subdirectories(


def test_consolidate_file_token_count(project_root, mock_project, mock_operations): # pylint: disable=unused-argument
codebase, file_count, token_count, lines_of_code_count = consolidate(project_root)
_, file_count, token_count, *_ = consolidate(project_root)

expected_file_count = len(
[
Expand All @@ -59,21 +59,39 @@ def test_consolidate_file_token_count(project_root, mock_project, mock_operation
]
)

assert file_count == expected_file_count
assert token_count > 0


def test_consolidate_line_of_code_count(project_root, mock_project, mock_operations):  # pylint: disable=unused-argument
    """Lines of code reported by consolidate() must match the mock project's file contents."""
    # consolidate returns (codebase, file_count, token_count, lines_of_code_count, type_distribution);
    # unpack the fourth element — `_, lines_of_code_count, *_` would grab file_count instead
    _, _, _, lines_of_code_count, _ = consolidate(project_root)

    expected_lines_of_code_count = sum(
        len(content.split("\n"))
        for file_path, content in mock_project.items()
        if not file_path.endswith((".codebaseignore", ".png", ".svg"))
    )

    assert lines_of_code_count == expected_lines_of_code_count

for file_path, content in mock_project.items():
if not file_path.endswith((".codebaseignore", ".png", ".svg")):
escaped_path = escape_markdown_characters(os.path.relpath(file_path, project_root))
assert re.search(rf"#### {re.escape(escaped_path)}", codebase)
assert content in codebase

def test_consolidate_file_type_distribution(
    project_root, mock_project, mock_operations
):  # pylint: disable=unused-argument
    """The consolidated markdown must contain one file heading per expected file type."""
    codebase, file_count, *_ = consolidate(project_root)

    # one file of each type exists in the mock project
    expected_types = {
        "py": 1,  # mock_project/python.py
        "md": 1,  # mock_project/markdown.md
        "txt": 1,  # mock_project/text.txt
        "yml": 1,  # mock_project/subdirectory/markup.yml
    }
    expected_total = sum(expected_types.values())

    assert file_count == expected_total

    for expected_type in expected_types:
        heading_pattern = rf"#### .*\.{expected_type.lower()}"
        assert re.search(heading_pattern, codebase, re.IGNORECASE)


def test_consolidate_removes_trailing_whitespace():
Expand Down