forked from Khan/khan-exercises
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlint_i18n_strings.py
executable file
·1907 lines (1506 loc) · 74.3 KB
/
lint_i18n_strings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
"""An i18n linting tool for exercises.

Catches common i18n problems in exercises and recommends solutions to
fix them. Wherever possible the script can also automatically fix the
problems with limited user interaction.

By default the script acts as a linter outputting errors that must be
corrected by hand and a list of any errors that could be fixed
automatically (which is still considered to be a linting failure).

If run with the --fix flag then the script will automatically fix the
files in place wherever possible. The user will be prompted to clear
up any ambiguities as well.
"""
import argparse
import copy
import re
import sys
import collections
# TODO(csilvers): replace lxml with HTMLParser (like extract_strings.py does)
import lxml.html
import lxml.html.html5parser
# When True, the user is asked to resolve cases that are ambiguous.
SHOW_PROMPT = True

# When True, ambiguous plurals are reported as errors.
ERROR_AMBIGUOUS_PLURALS = False

# Caches for the answers produced by the user-prompt pluralization
# methods. _PLURAL_FORMS starts out with a few common pluralizations.
_PLURAL_FORMS = {
    '': 's',
    'is': 'are',
    'was': 'were',
}
_PLURAL_NUM_POS = {}
_IS_PLURAL_NUM = {}

# The built-in functions which are sometimes pluralized. They are
# effectively treated as strings, since word-problems.js already takes
# care of their pluralization.
_functions = [
    'deskItem', 'exam', 'item', 'storeItem', 'crop', 'distance',
    'exercise', 'pizza', 'animal', 'fruit', 'group', 'clothing',
]

# Substrings of a variable name that, in an ambiguous case, tend to
# indicate what kind of value the variable holds.
_string_vars = ['TEXT', 'TYPE', 'UNIT', 'LOCATION']
_num_vars = ['NUM', 'AMOUNT', 'TOTAL']

# Regexes for deciding whether something looks like a quoted string or
# like a function call.
_STRING_RE = re.compile(r'^\s*["\'](.*?)["\']\s*$')
_FUNCTION_RE = re.compile(r'^\s*(\w+)\(.*\)\s*$')

# Selects every node that has non-whitespace text as a direct child and
# that is not itself nested inside another node meeting that criterion.
_HAS_TEXT = './text()[normalize-space(.)!=""]'
_XPATH_FIND_NODES = (
    '//*[%(has_text)s][not(ancestor::*[%(has_text)s])]'
    % {'has_text': _HAS_TEXT})

# Tags that extracted strings must not contain.
_REJECT_NODES = [
    'style',
    'script',
    'div[@class="validator-function"]',
    '*[contains(@data-type,"regex")]',
    '*[contains(@class,"graphie")]',
    '*[contains(@class,"guess")]',
]

# Nodes that should not be inside another extracted node.
_CANNOT_CONTAIN_NODES = ['p', 'div']

# Script nodes that might be contained within an extracted string.
_INLINE_SCRIPT_NODES = ['var', 'code']

# Nodes that might contain \text{} strings.
_TEXT_NODES = [
    '//code',
    '//var',
    '//script',
    '//*[contains(@class,"graphie")]',
    '//*[contains(@class,"validator-function")]',
]

# <var> nodes that might contain $._ strings.
_VAR_NODES = [
    '//var[not(ancestor::*[contains(@class,"vars")])]',
]

# Every tag that is ignored, i.e. that strings are not extracted from.
_IGNORE_NODES = _REJECT_NODES + _INLINE_SCRIPT_NODES
# Maps a raw character sequence to the HTML entity that represents it.
# The key is the UTF-8 byte sequence of a non-breaking space.
# NOTE(review): the value previously held a literal blank rather than an
# entity, which looks like the result of the file having been HTML-decoded
# at some point -- confirm against the code that consumes this table.
_ENTITY_TABLE = {
    "\xc2\xa0": "&nbsp;",
}

# Entities that should be cleaned up when they're set as the condition
# in a data-if attribute. Each key is the escaped entity and each value
# the plain character it stands for (a mapping of '&' -> '&' etc. would
# be a no-op).
_CLEAN_ENTITIES = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
}
# The HTML 5 parser used to turn the HTML documents into a usable DOM.
# Unless the caller says otherwise it ignores the implied HTML namespace
# and reads input files as utf-8.
class HTMLParser(lxml.html.html5parser.HTMLParser):
    """html5parser.HTMLParser with project-specific keyword defaults."""

    def __init__(self, *args, **kwargs):
        # Only supply the default when the caller gave no explicit value
        if 'namespaceHTMLElements' not in kwargs:
            kwargs['namespaceHTMLElements'] = False
        super(HTMLParser, self).__init__(*args, **kwargs)

    def parse(self, *args, **kwargs):
        # Only supply the default when the caller gave no explicit value
        if 'encoding' not in kwargs:
            kwargs['encoding'] = 'utf-8'
        return super(HTMLParser, self).parse(*args, **kwargs)


# Shared parser instance
PARSER = HTMLParser()
def main():
    """Handle running this program from the command-line.

    Every file named on the command line is passed to lint_file(). The
    errors found in each file, plus summary counts, are written to stderr.
    The process then exits with a status equal to the total number of
    errors, capped at 127 (so a run without errors exits with 0).

    Prompting is switched off (SHOW_PROMPT = False) unless --fix is given.
    """
    global SHOW_PROMPT

    # Handle parsing the program arguments
    arg_parser = argparse.ArgumentParser(
        description='Lint HTML exercise files for common i18n problems, '
                    'optionally fixing them in place.')
    arg_parser.add_argument('html_files', nargs='+',
        help='The HTML exercise files to lint.')
    arg_parser.add_argument('--quiet', '-q', action='store_true',
        help='Do not emit status to stderr on successful runs.')
    arg_parser.add_argument('--fix', action='store_true',
        help='Automatically fix some i18n issues in the input files.')

    args = arg_parser.parse_args()

    # Don't prompt the user if we're not fixing the results
    if not args.fix:
        SHOW_PROMPT = False

    # Keep track of how many errors and fixes occur and in how many files
    total_errors = 0
    total_error_files = 0
    total_fixes = 0
    total_fix_files = 0

    # Go through all the filenames provided
    for filename in args.html_files:
        # Lint the file, returns a list of error messages and
        # a count of the number of fixes that were automatically
        # applied (or that could be applied, when --fix is not set)
        (errors, num_fixes) = lint_file(filename, args.fix, not args.quiet)

        if num_fixes:
            # Keep track of how many fixes have been done, and in how
            # many files
            total_fixes += num_fixes
            total_fix_files += 1

        if errors:
            num_errors = len(errors)

            # Keep track of how many errors have occurred, and in how
            # many files
            total_errors += num_errors
            total_error_files += 1

            # Print out a notice indicating that an error occurred
            # in that file.
            print >>sys.stderr, ('%s error%s: %s.' % (
                num_errors, "" if num_errors == 1 else "s", filename))

            # Print out all the error messages
            for error_msg in errors:
                print >>sys.stderr, error_msg

        # If nodes were automatically fixed output the result
        if not args.quiet and args.fix and num_fixes:
            print >>sys.stderr, ('%s node%s have been fixed in %s.' % (
                num_fixes, "" if num_fixes == 1 else "s", filename))

    # Output the results of having fixed the files automatically
    if not args.quiet and args.fix:
        print >>sys.stderr, ('%s nodes fixed in %s file%s.' % (
            total_fixes, total_fix_files, "" if total_fix_files == 1 else "s"))

    # Output a total number of errors that have occurred
    if total_errors:
        print >>sys.stderr, ('%s error%s detected in %s file%s.' % (
            total_errors, "" if total_errors == 1 else "s",
            total_error_files, "" if total_error_files == 1 else "s"))

    # The exit status is the error count, capped at 127
    sys.exit(min(total_errors, 127))
def lint_file(filename, apply_fix, verbose):
    """Lint a single HTML exercise, repairing fixable nodes on request.

    Nodes that can be fixed automatically are fixed and the file is
    rewritten, if apply_fix is set to True. Problems that cannot be fixed
    automatically are reported as error strings.

    Arguments:
        - filename: A string filename to parse
        - apply_fix: If True, then filename is replaced with new contents,
          which is the fixed version of the old contents.
        - verbose: If there should be any output (not consulted by this
          function at present)

    Returns:
        - A tuple (errors, num_nodes_changed) which contains `errors`,
          which is a list holding strings describing errors found in the
          file and `num_nodes_changed` which is a number counting how many
          nodes were changed by the script (or could've been changed, if
          the apply_fix flag is set to False).
    """
    # A list of all the errors that occurred
    errors = []

    # Keep track of how many nodes have changed in the document
    # (Used to figure out if we need to write out a new version of the file)
    nodes_changed = 0

    # The filters through which the files should be passed and in which order
    filters = [PronounFilter, TernaryFilter, AlwaysPluralFilter, PluralFilter,
               AnFilter]

    if ERROR_AMBIGUOUS_PLURALS:
        filters.append(AmbiguousPluralFilter)

    # Collect all the i18n-able nodes out of file
    html_tree = lxml.html.html5parser.parse(filename, parser=PARSER)
    nodes = _extract_nodes(html_tree, filename)

    # Root HTML Tree. A file without any extracted nodes still has to be
    # searched for <var>/code nodes below, so fall back to the parsed tree
    # rather than None (calling None.xpath() would raise AttributeError).
    # NOTE(review): assumes get_page_html() accepts html_tree just as it
    # accepts the result of getroottree() -- confirm.
    root_tree = nodes[0].getroottree() if nodes else html_tree

    # Do a first pass linting against the file. This looks for rejected nodes
    # inside of extracted strings. For example, if a graphie element is in
    # an extracted string that is an error and the code needs to be fixed.

    # Nodes that should not be within the node
    bad_nodes = _REJECT_NODES + _CANNOT_CONTAIN_NODES

    # Construct an XPath expression for finding rejected nodes
    lint_expr = "|".join([".//%s" % name for name in bad_nodes])

    for node in nodes:
        # Every rejected node found inside an extracted string is an error
        for lint_node in node.xpath(lint_expr):
            errors.append("Contains invalid node:\n%s\nInvalid node:\n%s" % (
                _get_outerhtml(node), _get_outerhtml(lint_node)))

    # A second kind of error is when we have an <span
    # data-if="isSingular(...)"> that doesn't have natural language
    # (or another isSingular) inside it -- that is, it has nested
    # <span> tags instead. This isn't allowed: isSingular needs to be
    # at the lowest level of nesting.
    is_singular = '*[contains(@data-if,"isSingular")]'

    # "isSingular" nodes which don't contain text directly and don't
    # have an "isSingular" child node.
    search_expr = '//%s[not(%s)][not(./%s)]' % (is_singular, _HAS_TEXT,
                                                is_singular)
    for lint_node in html_tree.xpath(search_expr):
        errors.append("'isSingular' nodes must contain text directly;"
                      " distribute this node into its children:\n%s" % (
                          _get_outerhtml(lint_node)))

    # And now we run the nodes through all of our fixable filters. These
    # filters detect nodes that can be automatically fixed (and fix them in
    # the in-memory tree). They also detect nodes that should be fixed but
    # need some manual adjustment before they can be automatically fixed.
    # Those come up as errors.
    for filter_class in filters:
        node_filter = filter_class()

        # It's possible that the nodes will change, be replaced, or be
        # inserted during the processing of the filter. To avoid having to
        # re-load and parse the file a second time the next filter is fed
        # the list of nodes returned by this one.
        (nodes, new_errors, new_nodes_changed) = node_filter.process(nodes)

        # Add any errors onto the full list of errors
        errors += new_errors

        # Keep track of how many nodes have changed
        # (or would have changed, if apply_fix is False)
        nodes_changed += new_nodes_changed

    # Manually pluck out the <var>s to check for $._
    var_nodes = root_tree.xpath('|'.join(_VAR_NODES))
    (_, new_errors, new_nodes_changed) = (
        StringInVarFilter().process(var_nodes))
    errors += new_errors
    nodes_changed += new_nodes_changed

    # Manually pluck out the code/javascript nodes for \text{} processing
    text_nodes = root_tree.xpath('|'.join(_TEXT_NODES))
    (_, new_errors, new_nodes_changed) = (
        MathJaxTextFilter().process(text_nodes))
    errors += new_errors
    nodes_changed += new_nodes_changed

    if nodes_changed:
        # If any nodes have changed and we want to apply the fixes
        if apply_fix:
            # Then write out the modified file
            with open(filename, 'w') as f:
                f.write(get_page_html(root_tree))
        else:
            # Consider it to be an error when there are nodes that need
            # fixing and we haven't run with --fix
            errors.append(('%s node%s need to be fixed. '
                           'Re-run with --fix to automatically fix them.' % (
                               nodes_changed,
                               "" if nodes_changed == 1 else "s")))

    return (errors, nodes_changed)
class BaseFilter(object):
    """A base filter, replaces nodes and <var> elements.

    Sub-classes must define the following:
        - xpath: A string that holds the XPath expression for finding nodes.
        - get_match: A method taking a <var> node and returning a truthy
          match object if the <var> needs processing.
        - filter_var: A method taking (match, var_node) that processes a
          single fixable <var>. Returns False if the filtering proved to be
          a noop, otherwise True.
    """
    def __init__(self):
        """Initialize and keep track of nodes_changed and errors."""
        self.nodes_changed = 0
        self.errors = []

    def process(self, nodes):
        """Process all the nodes in the document.

        Returns a tuple of the resulting nodes, a list of error strings, and
        a number indicating how many nodes have changed.
        """
        # It's possible that the nodes will change, be replaced, or be inserted
        # during the processing of the filter. To avoid having to re-load and
        # parse the file a second time we build a list of nodes dynamically
        # from the filtered results.
        new_nodes = []

        for node in nodes:
            # Process a single node
            result = self.process_node(node)

            # It's possible that multiple nodes have been returned, if that's
            # the case then we extend the list
            if isinstance(result, (tuple, list)):
                new_nodes.extend(result)
            # Otherwise we just append the node to the list
            else:
                new_nodes.append(result)

        return (new_nodes, self.errors, self.nodes_changed)

    def process_node(self, orig_node):
        """Process a single node.

        Bail if the node doesn't contain any elements that may need
        fixing. (We discard the results of running this against the
        original node as we really want the result from the cloned
        node.) Unfortunately cloning the nodes is a more-expensive
        operation than running the XPath expression so we do this
        first to offset the expense.

        Returns the existing node or the modified node, if need be.
        """
        if not self.find_fixable_vars(orig_node):
            return orig_node

        # Copy the existing node and make a new one, if need be
        node = self.copy_node(orig_node)

        # A collection of all the <var>s under this node that might need fixing
        self.fixable_vars = self.find_fixable_vars(node)

        # Process the fixable vars in the node; a falsy result means the
        # copy is thrown away and the original node is kept untouched
        if not self.process_vars(self.fixable_vars):
            return orig_node

        # Replace orig_node with the new node we've generated, in the html tree
        return self.replace_node(orig_node, node)

    def find_fixable_vars(self, node):
        """Locate all the <var> elements that need fixing in a node.

        The search covers every inline script tag (_INLINE_SCRIPT_NODES)
        below `node` that satisfies this filter's `xpath` predicate.

        Returns a list of nodes.
        """
        # Construct an XPath expression for finding nodes to fix
        fix_expr = '|'.join(['.//%s[%s]' % (name, self.xpath)
                             for name in _INLINE_SCRIPT_NODES])
        return node.xpath(fix_expr)

    def copy_node(self, orig_node):
        """Create a copy of the node for further processing.

        Returns a copied version of the node.
        """
        # We copy the node to make sure we don't unintentionally modify
        # the original node.
        return copy.deepcopy(orig_node)

    def replace_node(self, orig_node, node):
        """Replace a node if we've generated a new node.

        Returns the node that now lives in the tree.
        """
        # We just replace the node with the newly-cloned node
        if orig_node != node:
            orig_node.getparent().replace(orig_node, node)
        return node

    def process_vars(self, fixable_vars):
        """Process all the <var> elements in a node.

        Each <var> for which get_match() returns something truthy is handed
        to filter_var(); nodes_changed is incremented whenever filter_var()
        reports that it changed something.

        Always returns True. (process_node treats a falsy return value as
        "leave the original node alone", which sub-classes may use.)
        """
        # Loop through the fixable var nodes
        for var_node in fixable_vars:
            # Extract parts of the code element's inner contents for
            # further processing.
            match = self.get_match(var_node)

            if match:
                # Process the fixable var
                if self.filter_var(match, var_node):
                    # Keep a tally of nodes that've been changed
                    self.nodes_changed += 1

        return True

    def get_match(self, var_node):
        raise NotImplementedError('Subclasses must define this')

    def filter_var(self, match, var_node):
        # The signature mirrors the call made in process_vars so that a
        # sub-class which forgets to override this gets the intended
        # NotImplementedError instead of a TypeError.
        raise NotImplementedError('Subclasses must define this')
class IfElseFilter(BaseFilter):
    """A filter for handling the generation of data-if/data-else nodes.

    This builds off of BaseFilter and modifies it in some critical ways:
        - The contents of <var> elements are inspected to extract unique keys
          upon which a data-if condition should be built.
        - If more than one key is detected then an error is generated and
          the node is left exactly as it was.
        - A new, cloned, node is generated to hold the contents of the
          data-else element and its contents.
        - If the node already has a data-if or data-else attribute then new
          inner nodes are generated instead.

    Sub-classes need to implement:
        - extract_key: A method for pulling a unique key from a match.
        - get_condition: A method that returns the condition to add to the
          node.
    """
    # Keep track of node class names which should not be directly
    # modified, in which only an inner <span> should be used.
    _blacklist_classes = ['problem', 'question']

    def process_node(self, orig_node):
        """Process a single node.

        Generates a clone of the node to be used to hold the data-else portion
        of the result. Also checks the keys detected to see if there are any
        problems. Finally, adds in the data-else and injects the cloned node.

        Returns the existing node or the modified node, if need be. Could also
        return a tuple of nodes
        """
        # Set by process_vars when the node holds more than one key
        self._too_many_keys = False

        # Create a cloned copy of the node, we're going to need this as
        # the fixer will likely need to generate a second copy of the
        # original node (for the 'data-else') but slightly modified.
        self.cloned_node = copy.deepcopy(orig_node)
        self.cloned_node.tail = ''

        # The vars that might need fixing under the cloned element
        self.cloned_vars = self.find_fixable_vars(self.cloned_node)

        # Process the node using the BaseFilter
        node = super(IfElseFilter, self).process_node(orig_node)

        # If we've located more than one key then the string needs to be
        # fixed by hand. process_vars has already aborted in that case, so
        # nothing in the tree has been touched.
        if self._too_many_keys:
            self._report_too_many_keys(orig_node)
            return orig_node

        # There's a reason for ignoring the node so we just end early
        if node is orig_node:
            return orig_node

        # Retained for sub-classes whose process_vars does not go through
        # the one defined on this class.
        if len(self.match_keys) > 1:
            self._report_too_many_keys(orig_node)
            return orig_node

        # Only continue if there are keys to process
        if self.match_keys:
            # Get the one remaining key
            key = self.match_keys[0]

            # Add an if condition to the node
            node.set('data-if', self.get_condition(key))

            # Add the data-else attribute to the cloned node
            self.cloned_node.set('data-else', '')

            # And insert it after the original node
            node.addnext(self.cloned_node)

            # Keep track of nodes that've been changed
            self.nodes_changed += 1

            # Return both nodes for further processing
            return (node, self.cloned_node)

        return node

    def _report_too_many_keys(self, orig_node):
        """Record the error for a node that mixes several different keys."""
        self.errors.append("Contains too many different keys (%s):\n%s" % (
            ", ".join(self.match_keys), _get_outerhtml(orig_node)))

    def _get_cloned_var(self, var_node):
        """Given a <var> node return the equivalent node from the cloned node.

        This is used to make it easy to work with the two sets of nodes
        simultaneously.
        """
        return self.cloned_vars[self.fixable_vars.index(var_node)]

    def process_vars(self, fixable_vars):
        """Extract the keys from all the <var>s in the node.

        Returns False, without modifying any <var>, when more than one
        distinct key was found; otherwise returns the result of
        BaseFilter.process_vars.
        """
        # Some nodes will have a unique 'key' which will be used as a
        # lookup. For example in the following nodes:
        #   <p><var>He(1)</var> threw a ball to <var>his(1)</var>
        #       friend.</p>
        #   <p><var>He(1)</var> threw a ball to <var>him(2)</var>.</p>
        # The first string has one key '1' used twice, whereas the second
        # string has two keys '1' and '2'. We keep track of this because
        # we need to use this key to generate the replacement string and
        # also to make sure that we don't attempt to fix a string that has
        # more than one key in it. For example the first string becomes:
        #   <p data-if="isMale(1)">He threw a ball to his friend.</p>
        #   <p data-else>She threw a ball to her friend.</p>
        # And the second one cannot be fixed automatically because it
        # has more than one key.
        match_keys = set()

        for var_node in fixable_vars:
            # Extract parts of the code element's inner contents for
            # further processing.
            match = self.get_match(var_node)

            if match:
                # Extract the key from the string (if it exists)
                key = self.extract_key(match)

                # If a key was extracted then add it to the set
                if key:
                    match_keys.add(key)

        # Sorted so that the error message lists the keys in a stable order
        self.match_keys = sorted(match_keys)

        if len(self.match_keys) > 1:
            # Bail before any <var> is rewritten. A falsy return makes
            # BaseFilter.process_node keep the original node, so a node
            # that must be fixed by hand is never half-rewritten in the
            # tree nor counted as changed.
            self._too_many_keys = True
            return False

        # Run the BaseFilter process_vars
        return super(IfElseFilter, self).process_vars(fixable_vars)

    def can_have_if(self, node):
        """Determines if a node can be modified using a data-if/data-else.

        Nodes that already have a data-if or data-else should not be modified
        and we end up wrapping the inner contents of the node instead.

        Additionally certain blacklisted nodes should not be modified, such as
        'question' or 'problem' nodes, of which there should only be one.

        Returns True if the node can be modified, False if not.
        """
        # Get the class from the element
        class_name = node.get('class')

        # If the node has a class and contains one of the blacklisted names
        # then we immediately fail. (This is a substring test on the whole
        # class attribute.)
        if class_name:
            for banned_class in self._blacklist_classes:
                if banned_class in class_name:
                    return False

        # Otherwise we fail if a data-if or data-else exists
        return node.get('data-if') is None and node.get('data-else') is None

    def replace_node(self, orig_node, node):
        """Replace the node only if it can have a data-if added to it.

        This is because nodes that have a data-if or data-else are left
        in-place and new wrappers were generated in copy_node.
        """
        # Don't replace if no matching <var> nodes were found
        if not self.match_keys:
            return orig_node

        if self.can_have_if(orig_node):
            return super(IfElseFilter, self).replace_node(orig_node, node)
        else:
            # Remove all child nodes within the original element. Iterate
            # over a snapshot so that removing children cannot interfere
            # with the iteration itself.
            for child_node in list(orig_node):
                orig_node.remove(child_node)

            # Clear any remaining text
            orig_node.text = ''

            # And insert the newly-created node into position
            orig_node.append(node)

            return node

    def copy_node(self, orig_node):
        """Copy the node only if it can't have an data-if added to it.

        We leave nodes that have a data-if or data-else in-place and new
        <span> wrappers are generated instead.
        """
        if self.can_have_if(orig_node):
            # Run the BaseFilter copy_node
            return super(IfElseFilter, self).copy_node(orig_node)
        else:
            # We clone the node to make sure we don't unintentionally modify
            # the original node.
            node = copy.deepcopy(orig_node)
            node.tail = ''

            # Change the tag names to just be a boring 'span'
            node.tag = self.cloned_node.tag = 'span'

            # Remove all existing attributes on both the copy and the
            # cloned node (cleared in one go rather than popped one at a
            # time while iterating over the same mapping)
            node.attrib.clear()
            self.cloned_node.attrib.clear()

            # Set the data-unwrap attribute to get the exercise framework
            # to automatically remove the <span> wrapper that we added
            node.set('data-unwrap', '')
            self.cloned_node.set('data-unwrap', '')

            return node

    def get_condition(self, key):
        raise NotImplementedError('Subclasses must define this')

    def extract_key(self, match):
        raise NotImplementedError('Subclasses must define this')
class PronounFilter(IfElseFilter):
    """Repairs usage of he()/He()/his()/His() in exercise files.

    Used by lint_file, automatically converts these methods into
    a more translatable form.

    For example given the following string:
        <p><var>He(1)</var> threw it to <var>his(1)</var> friend.</p>

    This filter will convert it into the following two nodes:
        <p data-if="isMale(1)">He threw it to his friend.</p>
        <p data-else>She threw it to her friend.</p>

    Creating two distinct strings for each gender (greatly simplifying
    the translation process).
    """
    _pronouns = ['he', 'He', 'his', 'His']
    _pronoun_map = {'he': 'she', 'He': 'She', 'his': 'her', 'His': 'Her'}
    _pronoun_condition = 'isMale(%s)'

    # Matches he|his(...)
    _regex = re.compile(r'^\s*(he|his)\(\s*(.*?)\s*\)\s*$', re.I)

    xpath = ' or '.join(['contains(text(),"%s(")' % pronoun
                         for pronoun in _pronouns])

    def get_match(self, fix_node):
        """Return a match of a string that matches he|his(...)

        Returns None when the node's text does not have that form.
        """
        # .text is None when the element has no leading text (for example
        # when it starts with a child element); treat that as "no match"
        # instead of handing None to the regex, which raises TypeError.
        return self._regex.match(fix_node.text or '')

    def extract_key(self, match):
        """From the match return the key of the string.

        For example with: he(1), '1' would be returned.
        """
        return match.group(2)

    def filter_var(self, match, var_node):
        """Replace the fixable node with the correct gender string.

        For example: <var>He(1)</var> will be 'He' and 'She' in the
        original and cloned nodes.

        Always returns True.
        """
        _replace_node(var_node, match.group(1))
        _replace_node(self._get_cloned_var(var_node),
                      self._pronoun_map[match.group(1)])
        return True

    def get_condition(self, key):
        """Generates a data-if condition to handle the gender toggle.

        This will turn a string like <p><var>He(1)</var> ran.</p> into:
            <p data-if="isMale(1)">He ran.</p><p data-else>She ran.</p>
        """
        return self._pronoun_condition % key
class AlwaysPluralFilter(BaseFilter):
    """Fix usage of plural() in exercises when the result is always plural.

    For example the string <var>plural(distance(1))</var> will always return
    the plural form of distance(1). We rewrite it to use a new method named
    `plural_form` which will always return the plural form of that word.

    There does exist some ambiguous cases and for those we need to prompt the
    user to determine if we're dealing with a string or a number. For example
    with the case: <var>plural(NUM)</var> the plural() method will return
    an 's' if the number is greater than 1 or an empty string if it is 1.

    Additionally sometimes the case of <var>plural("word")</var> was used,
    which is silly, so we just replace it with the text "words".
    """
    _empty_str_fn = '%s("", %s)'

    # Map old function name to new function name
    _function_map = {
        'plural': 'plural_form(%s)',
        'pluralTex': 'plural_form(%s)'
    }

    # Matches plural(...) with a single argument.
    # NOTE(review): the argument group nests a `+` inside a `*`
    # ((?:[^,]+|...)*), so the time needed to reject a non-matching string
    # (e.g. a two-argument call) can grow exponentially with the length of
    # the text before the first comma. The pattern is left unchanged here.
    _regex = re.compile(r'^\s*(plural|pluralTex)'
        r'\(\s*((?:[^,]+|\([^\)]*\))*)\s*\)\s*$', re.I)

    xpath = ' or '.join(['contains(text(),"%s(")' % method
                         for method in _function_map])

    def get_match(self, fix_node):
        """Return a match of a string that matches plural(...)

        Returns None when the node's text does not have that form.
        """
        # .text is None when the element has no leading text (for example
        # when it starts with a child element); treat that as "no match"
        # instead of handing None to the regex, which raises TypeError.
        return self._regex.match(fix_node.text or '')

    def filter_var(self, match, var_node):
        """Replace the <var> with the correct contents.

        This depends upon the contents of the plural() string.

        When the argument is a string literal. For example:
            <var>plural("string")</var>
        Will produce:
            strings

        When the variable holds a string. For example:
            <var>plural(UNIT_TEXT)</var>
        Will produce:
            <var>plural_form(UNIT_TEXT)</var>

        When the variable holds a number. For example:
            <var>plural(NUM)</var>
        Will produce:
            <var>plural("", NUM)</var>

        Always returns True.
        """
        # Handle the case where a raw string is used
        str_match = _STRING_RE.match(match.group(2))

        if str_match:
            # In this case just convert it directly to its plural form
            # We do this by prompting the user for help translating to the
            # correct plural form.
            _replace_node(var_node, get_plural_form(str_match.group(1)))

        # If the argument is a number
        elif get_is_plural_num(match):
            # Then we need to rewrite the function call so that it'll
            # be transformed into plural("", NUM), which will then be
            # converted into its correct form via the PluralFilter
            var_node.text = self._empty_str_fn % (match.group(1).strip(),
                                                  match.group(2).strip())
        else:
            # Make the string which will be used to wrap the output variable
            # We mark ambiguous strings with an obvious function name
            if _check_plural_is_ambiguous(match.group(2)):
                pluralize = 'AMBIGUOUS_PLURAL(%s)'
            else:
                pluralize = self._function_map[match.group(1)]

            # Otherwise we need to wrap the variable (or function call) in
            # a call to plural_form() which will attempt to return the
            # plural form of that string.
            var_node.text = pluralize % match.group(2).strip()

        return True
class PluralFilter(IfElseFilter):
"""Fix usage of plural() in exercises.
This filter fixes a number of different issues relating to the usage of
plural() in exercises. An interactive prompt is used to clear up any
information that can't be resolved automatically.
To start with it fixes the usage of two plural signatures.
The signature: <var>plural(STRING, NUM)</var> is pretty much left intact.
For example given the following string:
<p>I ran <var>NUM</var> <var>plural(distance(1), NUM)</var>.</p>
It will generate the following two strings:
<p data-if="isSingular(NUM)">I ran <var>NUM</var>
<var>distance(1)</var>.</p>
<p data-else>I ran <var>NUM</var>
<var>plural_form(distance(1), NUM)</var>.</p>
And given the following string:
<p>I ran <var>NUM</var> <var>plural(NUM, distance(1))</var>.</p>
It will generate the following two strings:
<p data-if="isSingular(NUM)">I ran <var>NUM</var>
<var>distance(1)</var>.</p>
<p data-else>I ran <var>NUM</var>
<var>plural_form(distance(1), NUM)</var>.</p>
(Note that the signature `plural(NUM, STRING)` outputs the number in
addition to the string itself.)
The tricky part about this ambiguous method signature is in figuring
out the different cases and how to resolve them. Let's step through each
possible case to show you how it's done.
The easiest case is when one of the arguments to the plural() function
is a string.
For example given the following string:
<p>I ran <var>plural(NUM, "mile")</var>.</p>
It will generate the following two strings:
<p data-if="isSingular(NUM)">I ran 1 mile.</p>
<p data-else>I ran <var>NUM</var> miles.</p>
The pluralization of the static string is done by prompting the user
for the correct plural form of the word.
The second most common case is in using one of the built-in string methods
such as `distance(POS)`, `item(POS)`, or `clothing(POS)`. We look for all
of these possible signatures and if one exists then we assume that that
is the string argument. The output will be the same as one of the above
examples.
The final case is when the arguments are truly ambiguous: When there is
no obvious way to detect if one argument is a string and one is a number.
We fix this by prompting the user for help in determining which argument
holds the number. With this information we can then easily resolve the
output (in a form similar to the handling of the built-in methods).
"""
# Map old function name to new function name
_function_map = {
'plural': 'plural_form(%s, %s)',
'pluralTex': 'plural_form(%s, %s)'
}
_ngetpos_condition = 'isSingular(%s)'
# See if it matches the form plural|pluralTex(..., ...)
_regex = re.compile(r'^\s*(plural|pluralTex)'
r'\(\s*((?:[^,(]+|\(.+?\))*),\s*((?:[^,(]+|\(.+?\))*)\s*\)\s*$', re.I)
xpath = ' or '.join(['contains(text(),"%s(")' % method
for method in _function_map.keys()])
def get_match(self, fix_node):
    """Return a match of a string that matches plural(...)

    Arguments:
        fix_node: the node whose `.text` is tested against `self._regex`.

    Returns:
        The regex match object for text of the form
        plural|pluralTex(..., ...): group 1 is the function name and
        groups 2 and 3 are the two arguments. Returns None when the text
        does not have that form or when the node has no text at all.
    """
    text = fix_node.text
    # A node may have no leading text (`.text` is None). Report that as
    # "no match" instead of letting the regex raise a TypeError.
    if text is None:
        return None
    # See if it matches the form plural|pluralTex(..., ...)
    return self._regex.match(text)
def extract_key(self, match):
    """Return the number argument of a plural() call.

    The singular/plural output is toggled on the number, so the number
    argument serves as the key for a given call to plural().

    Arguments:
        match: a match object produced by this class's plural() regex.

    Returns:
        The text of the number argument with surrounding whitespace
        removed.
    """
    # get_plural_num_pos() gives the 1-based position of the number
    # argument. Group 1 of the match is the function name, so the
    # arguments live in groups 2 and 3: shift the position by one.
    number_group = get_plural_num_pos(match) + 1
    number_arg = match.group(number_group)
    return number_arg.strip()
def filter_var(self, match, var_node):
"""Replace the <var>s with the correct plural() contents.
If the first argument to plural() is a string:
This means that the plural() function is expected to output just
the pluralized form of the string itself. We take this and turn
it into two blocks toggled with an if/else and with the string
hardcoded into it. For example:
<p>I have <var>plural("a cat", NUM)</var>.</p>
Would then become (after user prompting):
<p data-if="isSingular(NUM)">I have a cat.</p>
<p data-else>I have many cats.</p>
If the second argument to plural() is a string:
This means that the plural() function is expected to output a
number and the pluralized form of the string. We take this and turn
it into two blocks toggled with an if/else and with the number
variable and the string hardcoded into it. For example:
<p>I have <var>plural(NUM, "cat")</var>.</p>
Would then become (after user prompting):
<p data-if="isSingular(NUM)">I have <var>NUM</var> cat.</p>
<p data-else>I have <var>NUM</var> cats.</p>
Otherwise both of the results are variables or function calls:
This means that we need to insert the variables directly, for example:
<p>I have <var>plural(NUM, item(1))</var>.</p>
Would then become (after user prompting):
<p data-if="isSingular(NUM)">I have <var>NUM</var>
<var>item(1)</var>.</p>
<p data-else>I have <var>NUM</var>
<var>plural_form(item(1), NUM)</var>.</p>
To do this we need to determine which argument is the number variable
and then change the output depending upon it (because of the silly
plural() function argument order).
"""
cloned_var = self._get_cloned_var(var_node)
first_str_match = _STRING_RE.match(match.group(2))
second_str_match = _STRING_RE.match(match.group(3))
# If the first argument is a string:
if first_str_match:
# Get the word out of the string
word = first_str_match.group(1).strip()
# Replace the first node with just the word
_replace_node(var_node, word)
# Replace the cloned node with the plural form of the word
_replace_node(cloned_var, get_plural_form(word))
# If the second argument is a string
elif second_str_match:
# Get the word out of the string
word = second_str_match.group(1).strip()
# Have the <var> output just the number
var_node.text = cloned_var.text = match.group(2).strip()
# Insert the word after the singular <var>
var_node.tail = ' ' + word + (var_node.tail or '')
# Insert a space and the plural form of the word after the variable
cloned_var.tail = (' ' + get_plural_form(word) +
(cloned_var.tail or ''))
# Otherwise both of the results are variables or function calls.
else:
# Get the position of the number variable from the match
plural_num_pos = get_plural_num_pos(match)
# Check to see if the argument holding the string is ambiguously
# named, and thus we need to mark it as such.
check_str = (match.group(3) if plural_num_pos == 1 else
match.group(2))
# Make the string which will be used to wrap the output variable
# We mark ambiguous strings with an obvious function name
if _check_plural_is_ambiguous(check_str):
pluralize = 'AMBIGUOUS_PLURAL(%s, %s)'
else:
pluralize = self._function_map[match.group(1)]
# Number is in the first position, this results in the output:
# "NUM STRING". This signature is deprecated so we're going to
# convert it into a more translatable form.
if plural_num_pos == 1:
# We're going to turn the following:
# <var>plural(NUM, STRING)</var>
# Into the following for the singular and plural cases:
# <var>NUM</var> <var>STRING</var>
# <var>NUM</var> <var>plural_form(STRING)</var>
# We start by replacing the contents of the node with just the
# NUM var text resulting in: <var>NUM_VAR</var>
var_node.text = cloned_var.text = match.group(2).strip()
# We want to generate HTML that looks like this:
# <var>NUM_VAR</var> <var>STRING_VAR</var>
# <var>NUM_VAR</var> <var>plural_form(STRING_VAR)</var>
# We need to insert a new <var> element after the existing one
singular_var_node = var_node.makeelement('var')
plural_var_node = cloned_var.makeelement('var')
# In the singular case we just output <var>STRING_VAR</var>
singular_var_node.text = match.group(3).strip()