# filescan.py
# Scans the files under a directory against regex rules loaded from rules.yml
# and writes every match to an output file.
import os
import re
import yaml
import time
from tqdm import tqdm
def load_rules():
    """Parse ``rules.yml`` (relative to the working directory) and return its content.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``; whatever
    object the YAML document describes is returned unchanged.
    """
    with open('rules.yml', mode='r', encoding='utf-8') as stream:
        parsed_rules = yaml.safe_load(stream)
    return parsed_rules
def load_config():
    """Parse ``config.yml`` (relative to the working directory) and return its content.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``; whatever
    object the YAML document describes is returned unchanged.
    """
    with open('config.yml', mode='r', encoding='utf-8') as stream:
        settings = yaml.safe_load(stream)
    return settings
def should_skip_file(file_path, exclude_suffixes):
    """Return True when the extension of *file_path* is listed in *exclude_suffixes*.

    The extension is taken from ``os.path.splitext`` with its leading dot(s)
    removed, so entries in *exclude_suffixes* are compared without a dot
    (``"png"``, not ``".png"``). A path without an extension yields ``""``.
    """
    _, dotted_suffix = os.path.splitext(file_path)
    bare_suffix = dotted_suffix.lstrip('.')
    return bare_suffix in exclude_suffixes
def scan_file(file_path, rules):
    """Search every line of one file with every compiled rule.

    Args:
        file_path: Path of the file to read. It is opened as UTF-8 text and
            undecodable bytes are dropped (``errors='ignore'``).
        rules: Mapping of rule name to a compiled pattern (anything offering
            ``finditer``).

    Returns:
        A dict with one entry per rule name; each value is a list of
        ``(file_path, line_number, matched_text)`` tuples in file order,
        with line numbers starting at 1.
    """
    hits = {name: [] for name in rules}
    with open(file_path, 'r', encoding='utf-8', errors='ignore', buffering=8192) as handle:
        for lineno, text in enumerate(handle, start=1):
            for name, pattern in rules.items():
                hits[name].extend(
                    (file_path, lineno, found.group())
                    for found in pattern.finditer(text)
                )
    return hits
def scan_directory(directory, rules, exclude_suffixes):
    """Walk *directory* recursively and scan every non-excluded file.

    Args:
        directory: Root directory passed to ``os.walk``.
        rules: Mapping of rule name to compiled pattern, forwarded to
            ``scan_file``.
        exclude_suffixes: Collection of file extensions (without a leading
            dot) whose files are skipped, as decided by ``should_skip_file``.

    Returns:
        A dict with one entry per rule name; each value is the concatenation
        of the ``(file_path, line_number, matched_text)`` tuples reported by
        ``scan_file`` for every scanned file.

    A file that cannot be opened or read (``OSError``) is reported through
    ``tqdm.write`` and skipped, so one bad file does not abort the scan.
    """
    # Local import keeps this function self-contained; shutil is stdlib.
    import shutil

    all_rule_results = {rule_name: [] for rule_name in rules}

    # First pass over the tree only counts files so the progress bar has a total.
    file_count = sum(len(files) for _, _, files in os.walk(directory))

    # os.get_terminal_size() raises OSError when stdout is not a terminal
    # (pipe, cron, CI). shutil.get_terminal_size() falls back to a default.
    terminal_width = shutil.get_terminal_size().columns
    threshold_width = 100
    ncols = min(terminal_width, threshold_width)

    with tqdm(total=file_count, desc='Scanning files', ncols=ncols) as pbar:
        for root, _dirs, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                if not should_skip_file(file_path, exclude_suffixes):
                    try:
                        file_rule_results = scan_file(file_path, rules)
                    except OSError as exc:
                        # tqdm.write prints without corrupting the progress bar.
                        tqdm.write(f"无法读取文件,已跳过:{file_path}({exc})")
                    else:
                        for rule_name in rules:
                            all_rule_results[rule_name].extend(file_rule_results[rule_name])
                # Skipped, failed and scanned files all advance the bar by one.
                pbar.update(1)
    return all_rule_results
def write_results_to_file(results, output_file, base_dir):
    """Write the scan report to *output_file* as UTF-8 text.

    Args:
        results: Mapping of rule name to a list of
            ``(file_path, line_number, matched_text)`` tuples.
        output_file: Destination path; it is overwritten.
        base_dir: Directory that reported file paths are made relative to.

    For each rule a header line is written, then one line per match, then a
    blank separator line. The destination path is also announced on stdout.
    """
    print(f"结果输出到目标文件:{output_file}")
    with open(output_file, 'w', encoding='utf-8') as sink:
        for rule_name, hits in results.items():
            sink.write(f"规则名:{rule_name}\n")
            for hit_path, hit_line, hit_text in hits:
                shown_path = os.path.relpath(hit_path, base_dir)
                sink.write(f"{rule_name}:文件路径:{shown_path} 行号:{hit_line} 匹配到的内容:{hit_text}\n")
            sink.write("\n")
def main():
    """Command-line entry point: parse arguments, scan, write the report.

    Reads the regex rules via ``load_rules`` and the exclusion list from the
    ``excludeSuffix`` key of the config returned by ``load_config`` (a
    ``|``-separated string), scans the directory given with ``-d`` and writes
    the matches to the file given with ``-o``. Progress messages and the
    total run time are printed to stdout.

    Exits through ``argparse`` error handling when the ``-d`` path is not an
    existing directory.
    """
    import argparse

    parser = argparse.ArgumentParser(description='文件内容敏感信息扫描工具')
    parser.add_argument('-d', '--directory', required=True, help='指定要扫描的目录')
    parser.add_argument('-o', '--output_file', required=True, help='指定输出日志文件')
    args = parser.parse_args()

    # os.walk yields nothing for a missing directory, which would otherwise
    # produce an empty report that looks like a clean scan.
    if not os.path.isdir(args.directory):
        parser.error(f"指定的目录不存在: {args.directory}")

    # perf_counter is monotonic, so the elapsed time is immune to clock changes.
    start_time = time.perf_counter()
    print("程序开始运行...")

    # An empty YAML document parses to None; treat it as "no rules"/"no config".
    rules = load_rules() or {}
    compiled_rules = {rule_name: re.compile(rule_pattern) for rule_name, rule_pattern in rules.items()}
    config = load_config() or {}

    # ''.split('|') is [''], and an empty entry makes should_skip_file treat
    # every extensionless file (e.g. ".env", "Makefile") as excluded, so blank
    # entries are dropped. A set gives O(1) membership checks.
    raw_suffixes = config.get('excludeSuffix') or ''
    exclude_suffixes = {
        suffix.strip() for suffix in raw_suffixes.split('|') if suffix.strip()
    }

    results = scan_directory(args.directory, compiled_rules, exclude_suffixes)
    write_results_to_file(results, args.output_file, args.directory)

    total_time = time.perf_counter() - start_time
    print(f"程序运行结束,总运行时间: {total_time:.2f} 秒")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()