Skip to content

Commit 243ad8a

Browse files
committed
Merge branch 'rename-map'
* rename-map:
  - Preprocess: add unit tests for file renaming
  - Preprocess: rename find_files_to_be_renamed
  - Preprocess: automate renaming of case conflicts
  - Preprocess: simplify file renaming
2 parents ffb943f + b77f6bf commit 243ad8a

File tree

3 files changed

+266
-75
lines changed

3 files changed

+266
-75
lines changed

commands/preprocess.py

Lines changed: 70 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,6 @@ def rearrange_archive(root):
8484
for fn in fnmatch.filter(os.listdir(root), 'cppreference-export*.xml'):
8585
os.remove(os.path.join(root, fn))
8686

87-
def add_file_to_rename_map(rename_map, dir, fn, new_fn):
88-
path = os.path.join(dir, fn)
89-
if not os.path.isfile(path):
90-
print("ERROR: Not renaming '{0}' because path does not exist".format(path))
91-
return
92-
rename_map.append((dir, fn, new_fn))
93-
9487
# Converts the complex URL of a resource served by the MediaWiki loader into a simplified name
9588
def convert_loader_name(fn):
9689
if "modules=site&only=scripts" in fn:
@@ -106,56 +99,53 @@ def convert_loader_name(fn):
10699
else:
107100
raise Exception('Loader file {0} does not match any known files'.format(fn))
108101

109-
def find_files_to_be_renamed(root):
110-
# Returns a rename map: array of tuples each of which contain three strings:
111-
# the directory the file resides in, the source and destination filenames.
102+
def build_rename_map(root):
103+
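# Returns a rename map: a dict from old to new file name. A key is either a bare file name (rename all occurrences) or a file's full path (rename only that occurrence)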
104+
loader = re.compile(r'load\.php\?.*')
105+
query = re.compile(r'\?.*')
106+
result = dict()
112107

113-
# The rename map specifies files to be renamed in order to support them on
114-
# windows filesystems which don't support certain characters in file names
115-
rename_map = []
108+
# find files with invalid names -> rename all occurrences
109+
for fn in set(fn for _, _, filenames in os.walk(root) for fn in filenames):
110+
if loader.match(fn):
111+
result[fn] = convert_loader_name(fn)
116112

117-
files_rename = [] # general files to be renamed
118-
files_loader = [] # files served by load.php. These should map to
119-
# consistent and short file names because we
120-
# modify some of them later in the pipeline
113+
elif any((c in fn) for c in '?*"'):
114+
new_fn = query.sub('', fn)
115+
new_fn = new_fn.replace('"', '_q_')
116+
new_fn = new_fn.replace('*', '_star_')
117+
result[fn] = new_fn
121118

119+
# find files that conflict on case-insensitive filesystems
122120
for dir, _, filenames in os.walk(root):
123-
filenames_loader = set(fnmatch.filter(filenames, 'load.php[?]*'))
124-
# match any filenames with '?"*' characters
125-
filenames_rename = set(fnmatch.filter(filenames, '*[?"*]*'))
126-
127-
# don't process load.php files in general rename handler
128-
filenames_rename -= filenames_loader
129-
130-
for fn in filenames_loader:
131-
files_loader.append((dir, fn))
132-
for fn in filenames_rename:
133-
files_rename.append((dir, fn))
134-
135-
for dir, orig_fn in files_rename:
136-
fn = orig_fn
137-
fn = re.sub(r'\?.*', '', fn)
138-
fn = fn.replace('"', '_q_')
139-
fn = fn.replace('*', '_star_')
140-
add_file_to_rename_map(rename_map, dir, orig_fn, fn)
141-
142-
# map loader names to more recognizable names
143-
for dir, fn in files_loader:
144-
new_fn = convert_loader_name(fn)
145-
add_file_to_rename_map(rename_map, dir, fn, new_fn)
146-
147-
# rename filenames that conflict on case-insensitive filesystems
148-
# TODO: perform this automatically
149-
add_file_to_rename_map(rename_map, os.path.join(root, 'en/cpp/numeric/math'), 'NAN.html', 'NAN.2.html')
150-
add_file_to_rename_map(rename_map, os.path.join(root, 'en/c/numeric/math'), 'NAN.html', 'NAN.2.html')
151-
return rename_map
152-
153-
def rename_files(rename_map):
154-
for dir, old_fn, new_fn in rename_map:
121+
seen = dict()
122+
for fn in (result.get(s, s) for s in filenames):
123+
low = fn.lower()
124+
num = seen.setdefault(low, 0)
125+
if num > 0:
126+
name, ext = os.path.splitext(fn)
127+
# add file with its path -> only rename that occurrence
128+
result[os.path.join(dir, fn)] = "{}.{}{}".format(name, num + 1, ext)
129+
seen[low] += 1
130+
131+
return result
132+
133+
def rename_files(root, rename_map):
134+
for dir, old_fn in ((dir, fn) for dir, _, filenames in os.walk(root) for fn in filenames):
155135
src_path = os.path.join(dir, old_fn)
156-
dst_path = os.path.join(dir, new_fn)
157-
print("Renaming '{0}' to \n '{1}'".format(src_path, dst_path))
158-
shutil.move(src_path, dst_path)
136+
137+
new_fn = rename_map.get(old_fn)
138+
if new_fn:
139+
# look for case conflict of the renamed file
140+
new_path = os.path.join(dir, new_fn)
141+
new_fn = rename_map.get(new_path, new_fn)
142+
else:
143+
# original filename unchanged, look for case conflict
144+
new_fn = rename_map.get(src_path)
145+
if new_fn:
146+
dst_path = os.path.join(dir, new_fn)
147+
print("Renaming {0}\n to {1}".format(src_path, dst_path))
148+
shutil.move(src_path, dst_path)
159149

160150
def find_html_files(root):
161151
# find files that need to be preprocessed
@@ -172,7 +162,7 @@ def is_loader_link(target):
172162

173163
def transform_loader_link(target, file, root):
174164
# Absolute load.php links need to be made relative
175-
abstarget = os.path.join(root, "common/" + convert_loader_name(target))
165+
abstarget = os.path.join(root, "common", convert_loader_name(target))
176166
return os.path.relpath(abstarget, os.path.dirname(file))
177167

178168
def is_ranges_placeholder(target):
@@ -201,20 +191,33 @@ def transform_ranges_placeholder(target, file, root):
201191
return os.path.relpath(abstarget, os.path.dirname(file))
202192

203193
def is_external_link(target):
204-
if re.match('(ht|f)tps?://', target):
205-
return True
206-
return False
194+
url = urllib.parse.urlparse(target)
195+
return url.scheme != '' or url.netloc != ''
196+
197+
def trasform_relative_link(rename_map, target, file):
198+
# urlparse returns (scheme, netloc, path, params, query, fragment)
199+
_, _, path, params, _, fragment = urllib.parse.urlparse(target)
200+
assert params == ''
201+
202+
path = urllib.parse.unquote(path)
203+
path = path.replace('../../upload.cppreference.com/mwiki/','../common/')
204+
path = path.replace('../mwiki/','../common/')
205+
206+
dir, fn = os.path.split(path)
207+
new_fn = rename_map.get(fn)
208+
if new_fn:
209+
# look for case conflict of the renamed file
210+
abstarget = os.path.normpath(os.path.join(os.path.dirname(file), dir, new_fn))
211+
new_fn = rename_map.get(abstarget, new_fn)
212+
else:
213+
# original filename unchanged, look for case conflict
214+
abstarget = os.path.normpath(os.path.join(os.path.dirname(file), path))
215+
new_fn = rename_map.get(abstarget)
216+
if new_fn:
217+
path = os.path.join(dir, new_fn)
207218

208-
def trasform_relative_link(rename_map, target):
209-
target = urllib.parse.unquote(target)
210-
for _, fn, new_fn in rename_map:
211-
target = target.replace(fn, new_fn)
212-
target = target.replace('../../upload.cppreference.com/mwiki/','../common/')
213-
target = target.replace('../mwiki/','../common/')
214-
target = re.sub(r'(\.php|\.css)\?.*', r'\1', target)
215-
target = urllib.parse.quote(target)
216-
target = target.replace('%23', '#')
217-
return target
219+
path = urllib.parse.quote(path)
220+
return urllib.parse.urlunparse(('', '', path, params, '', fragment))
218221

219222
# Transforms a link in the given file according to rename map.
220223
# target is the link to transform.
@@ -230,7 +233,7 @@ def transform_link(rename_map, target, file, root):
230233
if is_external_link(target):
231234
return target
232235

233-
return trasform_relative_link(rename_map, target)
236+
return trasform_relative_link(rename_map, target, file)
234237

235238
def has_class(el, *classes_to_check):
236239
value = el.get('class')

preprocess.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ def main():
3838

3939
preprocess.rearrange_archive(root)
4040

41-
rename_map = preprocess.find_files_to_be_renamed(root)
42-
preprocess.rename_files(rename_map)
41+
rename_map = preprocess.build_rename_map(root)
42+
preprocess.rename_files(root, rename_map)
4343

4444
# clean the html files
4545
file_list = preprocess.find_html_files(root)

0 commit comments

Comments
 (0)