Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created an ascii converter #61

Merged
merged 1 commit into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .pre-commit-hooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@
entry: mdlinker
language: rust
pass_filenames: false
- id: enforce-ascii
name: Enforce ASCII
description: This hook reports non-ASCII characters and, when run with --fix, replaces those that have a known ASCII equivalent
entry: bin/enforce-ascii
language: python
pass_filenames: true
9 changes: 4 additions & 5 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@ Linking works best when you spell things correctly, in both your filenames and f
rev: v1.23.7
hooks:
- id: typos
- repo: https://github.com/sirosen/texthooks
rev: 0.6.8
- repo: https://github.com/ryanpeach/mdlinker
rev: <VERSION>
hooks:
- id: fix-smartquotes
- id: fix-ligatures
- id: fix-spaces
- id: enforce-ascii
- id: mdlinker
```

# Configuration
Expand Down
95 changes: 95 additions & 0 deletions bin/enforce-ascii
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
import sys
import argparse
import re

def process_file(file_path, char_map=None):
    """Find, and optionally replace, the non-ASCII characters in a file.

    The file is decoded as UTF-8. Every character with a code point above
    127 is collected. When ``char_map`` is given, each character of the
    file that appears as a key in it is replaced by the mapped string, and
    the file is rewritten only if at least one line actually changed.

    Args:
        file_path: Path of the file to inspect.
        char_map: Optional mapping of single characters to replacement
            strings. ``None`` (or an empty mapping) means report only.

    Returns:
        The set of non-ASCII characters seen in the file *before* any
        replacement. If the file cannot be read, an error is printed to
        stderr and whatever was collected up to that point is returned.
    """
    non_ascii_chars = set()
    output_lines = []
    fixed = False
    try:
        # newline='' disables newline translation, so each line keeps the
        # exact terminator it has on disk (LF, CRLF or CR).
        with open(file_path, 'r', encoding='utf-8', newline='') as file:
            for line in file:
                non_ascii_chars.update(c for c in line if ord(c) > 127)
                if char_map:
                    fixed_line = ''.join(char_map.get(c, c) for c in line)
                    output_lines.append(fixed_line)
                    fixed = fixed or fixed_line != line
                else:
                    output_lines.append(line)

        if char_map and output_lines and fixed:
            # Write the terminators back untouched as well; otherwise a fix
            # would silently convert the whole file's line endings.
            with open(file_path, 'w', encoding='utf-8', newline='') as file:
                file.writelines(output_lines)
            print(f"Fixed non-ASCII characters in: {file_path}")
        return non_ascii_chars

    except UnicodeDecodeError as e:
        print(f"Error reading {file_path}: {e}", file=sys.stderr)
    except FileNotFoundError:
        print(f"File not found: {file_path}", file=sys.stderr)
    except OSError as e:
        # Directories, permission problems, etc.: report and move on to the
        # next file instead of aborting the whole run with a traceback.
        print(f"Error accessing {file_path}: {e}", file=sys.stderr)
    return non_ascii_chars

def parse_custom_replacements(replacement_list):
    """Parse ``char:replacement`` command-line items into a mapping.

    Each item is split at its first ``:``; the text before it is the key
    and everything after it (further colons included) is the replacement.
    Items without a colon, or with an empty key or empty replacement, are
    skipped, and a warning is printed to stderr so a typo does not go
    unnoticed.

    Args:
        replacement_list: Iterable of strings in ``char:replacement`` form.

    Returns:
        A dict mapping each key to its replacement string. When a key is
        repeated, the last occurrence wins.
    """
    char_map = {}
    for item in replacement_list:
        key, separator, value = item.partition(':')
        if not (separator and key and value):
            print(
                f"Ignoring invalid replacement {item!r}: "
                "expected the format char:replacement",
                file=sys.stderr,
            )
            continue
        if len(key) > 1:
            # The key is still stored, but the file processing in this
            # script looks characters up one at a time, so a longer key
            # cannot match anything there.
            print(
                f"Warning: replacement key {key!r} is longer than one "
                "character and will not match any single character",
                file=sys.stderr,
            )
        char_map[key] = value
    return char_map

def main():
    """Command-line entry point: report or fix non-ASCII characters.

    File paths are taken from the positional arguments. Only when none are
    given, and stdin is not a terminal, are paths read from stdin (one per
    line). For every file the set of non-ASCII characters found is printed;
    with ``--fix`` the characters present in the replacement map are
    replaced in place.
    """
    parser = argparse.ArgumentParser(
        description="Find and optionally fix non-ASCII characters in files."
                    "\nExamples:"
                    "\n - Find all non-ASCII characters in a file:"
                    "\n python3 script.py file.txt"
                    "\n - Replace non-ASCII characters using predefined mappings:"
                    "\n python3 script.py --fix file.txt"
                    "\n - Add custom replacements (e.g., replace ‘ with ' and … with ...):"
                    "\n python3 script.py --fix --replacements '‘:\"' '…:...' file.txt"
                    "\n - Process multiple files with `find` and `xargs`:"
                    "\n find . -type f -name \"*.txt\" | xargs python3 script.py --fix",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('--fix', action='store_true', help="Replace non-ASCII characters based on a predefined or custom map")
    parser.add_argument('-r', '--replacements', nargs='*', default=[], help="Custom replacements in the format char:replacement")
    parser.add_argument('files', nargs='*', help="Files to process. Use with find | xargs.")
    args = parser.parse_args()

    # Define default replacement map for non-ASCII characters
    char_map = {
        '‘': "'",        # Left single quotation mark
        '’': "'",        # Right single quotation mark
        '“': '"',        # Left double quotation mark
        '”': '"',        # Right double quotation mark
        '…': '...',      # Ellipsis
        '–': '-',        # En-dash
        '—': '--',       # Em-dash
        '\u00a0': ' ',   # Non-breaking space (escaped: the literal is invisible)
    }

    # `--replacements` takes any number of values, so argparse also hands it
    # the file paths that follow it on the command line (as in the help
    # example above). An entry without ':' can never be a replacement, so
    # treat it as a file path instead of silently dropping it.
    replacement_items = [item for item in args.replacements if ':' in item]
    swallowed_paths = [item for item in args.replacements if ':' not in item]

    # Add custom replacements to the map
    custom_map = parse_custom_replacements(replacement_items)
    char_map.update(custom_map)

    # Explicit arguments always win. stdin is consulted only as a fallback:
    # under xargs, CI or a hook runner stdin is usually an empty pipe, and
    # preferring it would throw away the paths passed as arguments.
    files = args.files + swallowed_paths
    if not files and not sys.stdin.isatty():
        stripped = (line.strip() for line in sys.stdin)
        files = [path for path in stripped if path]

    if not files:
        print("No files specified or provided via stdin.", file=sys.stderr)
        return

    for file_path in files:
        non_ascii_chars = process_file(file_path, char_map if args.fix else None)
        if non_ascii_chars:
            # Sort so the report is stable from run to run (set order is not).
            print(f"Non-ASCII characters found in {file_path}: {''.join(sorted(non_ascii_chars))}")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Loading