-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStatuteItem.py
615 lines (570 loc) · 35 KB
/
StatuteItem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
# Copyright (C) 2022 Ian Caines
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import XMLStatParse
from Constants import sectionTypes, formulaSectionTypes, formulaSectionMap, textTypes, knownTextTags, textTriggers
import Constants
import SectionLabelLib
from ErrorReporter import showError
import textutil
from StatutePart import StatutePart
class StatuteException(Exception):
    """Exception raised by this module for unrecoverable problems, e.g. runaway recursion while
    processing a text tree or a HeadingItem constructed without a tree."""
#####
#
# Items, which represent parts of the parsed statute structure
#
####
class BaseItem(StatutePart):
    """Superclass for all items in the statute text structure (*not* headings --- maybe it should be renamed).

    Provides the general-purpose machinery shared by the concrete item classes: section-label lookup,
    depth-first iteration over subitems, dispatch of child tree nodes to the matching item class,
    and rendering of the item as merged paragraphs."""

    def __init__(self, parent, tree, statute=None):
        """parent - the containing object (handed to StatutePart)
        tree - the top node in the tree corresponding to this item
        statute - the statute with which the item is associated (handed to StatutePart)"""
        StatutePart.__init__(self, parent=parent, statute=statute)
        self.tree = tree  # the top node in the tree corresponding to this item
        self.items = []  # list of immediate subitems for this item

    def getStatute(self):
        """Returns the statute with which this item is associated."""
        return self.statute

    def getIndentLevel(self):
        """Returns the indent level of this item; by default, that of its parent."""
        return self.parent.getIndentLevel()

    def itemIterator(self):
        """Returns an iterator over this item and all its subitems, depth first."""
        yield self
        for subitem in self.items:
            for c in subitem.itemIterator():
                yield c

    def getLocationString(self):
        """Location of a BaseItem is given by its sectionLabel."""
        return self.getSectionLabel().getDisplayString()

    def getSectionLabel(self):
        """Returns the sectionLabel of this object, or its parent's if this item is not labeled.
        @rtype: SectionLabelLib.SectionLabel
        """
        label = self.getImmediateSectionLabel()
        if label is not None:
            return label
        return self.parent.getSectionLabel()

    def getImmediateSectionLabel(self):
        """Returns the section label of this particular item, or None if self is not itself labeled.
        The base implementation is never labeled; labeled subclasses override this.
        @rtype: SectionLabelLib.SectionLabel
        """
        return None

    def getRenderedText(self, renderContext, skipLabel=False, baseLevel=0):
        """Get the text for this item, rendered according to the provided context.

        Adjacent paragraphs are merged wherever Paragraph.merge allows it, and the results are joined
        with newlines. Returns an empty string if the item produces no paragraphs.
        @rtype: str
        """
        paragraphs = self.getParagraphs(renderContext, skipLabel=skipLabel)
        if not paragraphs:
            return ""  # nothing to render (previously an IndexError on paragraphs[0])
        # merge paragraphs, where possible
        mergedParagraphs = [paragraphs[0]]
        for p in paragraphs[1:]:
            if not mergedParagraphs[-1].merge(p):
                mergedParagraphs.append(p)
        return "\n".join(p.getRenderedText(baseLevel=baseLevel) for p in mergedParagraphs)

    def getParagraphs(self, renderContext, skipLabel=False):
        """Get list of paragraph text-blocks for this item, rendered according to the current context.
        Gets overridden in certain subclasses to reflect different paragraph breakdown (e.g., in TextItems).
        The base implementation ignores skipLabel, since it has no label of its own."""
        return self.getSubParagraphs(renderContext)

    def getSubParagraphs(self, renderContext):
        """Returns the concatenated paragraph lists of all immediate subitems, in order."""
        paragraphs = []
        for c in self.items:
            paragraphs += c.getParagraphs(renderContext)
        return paragraphs

    def extractMetaData(self):
        """Extracts meta data from the item's tree, and returns a list of subnodes other than those providing metadata.
        The base implementation extracts nothing and returns every child node."""
        return list(self.tree)

    def handleSubsections(self, subsecs):
        """Handles the subsections of the section that are left over after extractMetaData has done its work.
        Called by items that may have subsections. Each node is wrapped in the matching item class and
        appended to self.items; anything unrecognized is reported through showError."""
        for child in subsecs:
            if child.tag == "definition":
                # this is first, so that definitions are parsed as DefinitionItems rather than
                # generic SectionItems, despite being a sectionType
                self.items.append(DefinitionItem(parent=self, tree=child))
            elif child.tag in sectionTypes:
                # other types of section, including formuladefinition
                self.items.append(SectionItem(parent=self, tree=child))
            elif child.tag in formulaSectionTypes:
                self.items.append(SectionItem(parent=self, tree=child))
            elif child.tag == "formulagroup":
                # top level for a formula --- handled specially so we can extract the formula itself
                self.items.append(FormulaItem(parent=self, tree=child))
            elif child.tag == "provision":
                # provision tags only appear in ITA 211.1, this provides an acceptable way of displaying them
                self.items.append(TextItem(parent=self, tree=child, forceNewParagraph=True))
            elif child.tag == "readastext":
                self.items.append(ReadAsItem(parent=self, tree=child))
            elif child.tag in textTypes:
                # tags that encapsulate only text
                self.items.append(TextItem(parent=self, tree=child))
            elif child.tag == "a":
                # the only link tolerated here is the "previous version" link, which is dropped silently
                txt = child.getRawText().strip().lower()
                if txt != "previous version":
                    showError("Unknown <a> tag: ["+txt+"]", location=self)
            elif isinstance(child, XMLStatParse.TextNode):
                # report an error if we are ignoring any raw (non-whitespace) text
                if child.getRawText().strip() != "":
                    showError("Text appearing directly in a section: ["+child.getRawText()+"]", location=self)
            else:
                showError("Unknown tag: [" + repr(child) + "]", location=self)

    def __repr__(self):
        return "<Item: "+self.getRawText()+">"

    def getInitialTextItem(self):
        """Returns the initial TextItem under this object, or None if there is none.
        Useful for grabbing the applicability provisions in a definition section.
        @rtype: TextItem"""
        for item in self.itemIterator():
            if isinstance(item, TextItem):
                return item
        return None

    def getRawText(self, limit=500):
        """Returns raw text of the item (used for debugging), truncated to at most limit characters.
        @rtype: str
        """
        remainder = limit
        pieces = []
        for item in self.items:
            # a subitem with nothing to show may hand back None; treat that as empty text
            pieces.append(item.getRawText(remainder) or "")
            remainder -= len(pieces[-1])
            if remainder <= 0:
                break  # already have enough text
        return "".join(pieces)[:limit]

    def getMarginalList(self):
        """Returns a list of the marginal notes of every SectionItem at or below this item
        (depth first), skipping sections that have no marginal note.
        @rtype: list of str
        """
        notes = [item.getMarginalNote() for item in self.itemIterator() if isinstance(item, SectionItem)]
        return [note for note in notes if note is not None]

    def getTitle(self, limit=100):
        """Returns a string summarizing the contents of the section: the marginal notes joined by " / ".
        If the result is longer than limit it is cut to limit characters and "..." is appended.
        @type limit: int
        @rtype: str
        """
        notes = self.getMarginalList()
        if not notes:
            return ""
        s = " / ".join(notes)
        if len(s) > limit:
            # TODO: force break at the final space in the title
            s = s[:limit] + "..."
        return s
class SectionItem(BaseItem):
    """Class for a section / subsection / etc.

    On construction it pulls the metadata (marginal note, label, historical note, repealed flag) out
    of the tree, settles its SectionLabel, and then builds subitems from the remaining child nodes."""

    def __init__(self, parent, tree, statute=None):
        BaseItem.__init__(self, parent, tree, statute)
        # TODO : extract the section label code from the tree, if present
        self.finalizedLabel = False  # Says whether label finalizer has run -- we should worry if it has yet there is no SectionLabel
        # Metadata defaults are set before anything can call showError(location=self),
        # so that error reporting never sees a half-initialized item.
        self.marginalNote = None
        self.labelString = None  # string labelling this particular element (e.g., "(ii.1)")
        self.historicalNote = None
        self.repealed = False
        self.sectionLabel = None
        if tree.labels is not None:
            # construct a SectionLabel object from the labels parameter of the node, if present
            try:
                self.sectionLabel = SectionLabelLib.SectionLabel(labelList=tree.labels)
            except Exception as e:
                showError("Error parsing sectionLabel: ["+ str(e) +"]", location=self)
        # extract marginal note and label, if present
        subsecs = self.extractMetaData()  # fill in prior variables, leaving any remaining nodes to process
        self.finalizeSectionLabel()
        # handle other subitems, which should all be types of sections or blocks of text
        self.handleSubsections(subsecs)

    def _recordLabel(self, child, subsecs, kind):
        """Stores the text of a label-like node as self.labelString, reporting an error if a label was
        already seen or if other content preceded it. kind is the prefix used in the error messages."""
        if self.labelString is not None:
            showError(kind + " encountered after another label. ["+ self.labelString +"]["+child.getRawText().strip()+"]", location=self)
        self.labelString = child.getRawText().strip()
        if len(subsecs) > 0:
            showError(kind + " encountered after other text ["+ self.labelString +"]["+str(subsecs)+"]", location=self)

    def extractMetaData(self):
        """Extract information on section label / marginal note, and returns the list of remaining subitems to be processed."""
        subsecs = []  # TODO: factor this out into a method that can be overriden for definitions
        for child in self.tree:
            if child.tag == "marginalnote":
                self.marginalNote = child.getRawText().strip()
            elif child.tag == "label":
                # the final mark for this section (e.g., (ii.1))
                self._recordLabel(child, subsecs, "Label")
            elif child.tag == "formulaterm":
                # the letter being defined in a formula definition section
                self._recordLabel(child, subsecs, "Formula term label")
            elif child.tag == "historicalnote":
                if self.historicalNote is not None:
                    showError("Multiple historical notes", location=self)
                # TODO: improve handling of historical notes -- give them their own items that parse the contents and generate paragraphs
                self.historicalNote = child.getSpacedRawText().strip()
            elif child.tag == "repealed":
                self.repealed = True
                subsecs.append(child)  # don't ignore
            elif isinstance(child, XMLStatParse.TextNode) and child.getRawText() == "":
                pass  # ignore empty textnodes
            else:
                subsecs.append(child)
        return subsecs

    def finalizeSectionLabel(self):
        """Method that verifies and/or sets the SectionLabel object for the section by looking at the parent
        section label and the labelString provided for this object. If the underlying node did not yield
        a SectionLabel, one is simply constructed by appending the current label to the parent's SectionLabel."""
        # create imputed SL from parent
        selfType = self.tree.tag  # derive the type of the new Numbering type to add to the label from the tag
        if selfType in formulaSectionMap:
            selfType = formulaSectionMap[selfType]
        if self.labelString is not None:
            cleanLabel = self.labelString.strip("().")
        else:
            cleanLabel = u""
        if u" to " in cleanLabel or u" and " in cleanLabel:
            # if label string contains a connector, only look at first part
            # (this typically happens for repealed groups of sections)
            cleanLabel = cleanLabel.split(" ")[0].strip("()")
        if self.parent is not None:
            imputedSL = self.parent.getSectionLabel().addLabel(selfType, cleanLabel)
        else:
            imputedSL = SectionLabelLib.SectionLabel(labelList=[(selfType, cleanLabel)])
        currentSL = self.getImmediateSectionLabel()
        if currentSL is None:
            # no SL derived from the tree, so use the imputed SL
            self.sectionLabel = imputedSL
        elif not currentSL.quasiEqual(imputedSL):
            # compare with SL derived from the xml tag, and show error on mismatch
            showError("Inconsistent labelling, Current:["+currentSL.getDisplayString()+"] Imputed["+imputedSL.getDisplayString()+"]", location=self)
        self.finalizedLabel = True

    def getMarginalNote(self):
        """Returns the marginal note text of this section, or None if it has none."""
        return self.marginalNote

    def getImmediateSectionLabel(self):
        """Returns this item's sectionLabel, or None if no label. Shows error if label has been finalized yet there is no sectionLabel.
        @rtype: SectionLabelLib.SectionLabel"""
        if self.sectionLabel is not None:
            return self.sectionLabel  # the section label object pinpointing this provision
        if self.finalizedLabel:
            # if label finalized, no reason not to have sectionLabel
            showError("SectionItem lacking immediate label ["+self.tree.tag+"]", location = self.parent)
        return None

    def getLabelString(self):
        """Returns the top-level string tag labeling this provision (appearing at the start of text), or None."""
        return self.labelString

    def getIndentLevel(self):
        """Returns the indent level given by the section label, or the parent's level if there is no label."""
        sl = self.getSectionLabel()
        if sl is None:
            return self.parent.getIndentLevel()  # return the parent's level, if there's no section label here
        return sl.indentLevel()

    def getParagraphs(self, renderContext, skipLabel=False):
        """Paragraphs of a section consist of the marginal note, the anchor and label, the paragraphs from
        any subobjects, and finally the historical note. The anchor and label are skipped if skipLabel is True."""
        paragraphs = []
        needForce = True  # need to explicitly force a new paragraph on a subitem paragraph
        if self.marginalNote is not None:
            paragraphs.append(Paragraph(text=self.marginalNote, renderContext=renderContext, isMarginalNote=True))
            needForce = False
        # NOTE(review): the nesting of the label paragraph under "not skipLabel" follows the original comment
        # ("Label is skipped if skipLabel is set") -- confirm against the upstream layout.
        if not skipLabel:
            # add anchor
            anchor = renderContext.renderAnchor(self.statute.getStatuteData().getAnchor(self.getSectionLabel()))
            needForce = False  # because we've forced new paragraph with the anchor
            paragraphs.append(Paragraph(text=anchor, renderContext=renderContext, indentLevel=self.getIndentLevel(), forceNewParagraph=True))
            if self.getLabelString() is not None:
                paragraphs.append(Paragraph(text=renderContext.boldText(self.getLabelString()), renderContext=renderContext, indentLevel=self.getIndentLevel(), softSpace=True))
        paragraphs += self.getSubParagraphs(renderContext)
        if self.historicalNote is not None:
            paragraphs.append(Paragraph(text=renderContext.newLine() + "HISTORICAL INFORMATION: " + self.historicalNote, renderContext=renderContext, forceNewParagraph=True, indentLevel=self.getIndentLevel()))
        if needForce and len(paragraphs) > 0:
            # force a new paragraph to start if not already accomplished by label string or marginal note
            paragraphs[0].forceNewParagraph = True
        return paragraphs

    def getRawText(self, limit=500):
        """Returns the raw text of the subitems, prefixed by the label string (if any), cut to limit characters.
        @rtype: str"""
        s = BaseItem.getRawText(self, limit=limit)
        if self.getLabelString() is not None:
            s = self.getLabelString() + " " + s
        return s[:limit]
class DefinitionItem(SectionItem):
    """Special subclass for handling definitions.
    (By overriding extractMetaData, provides special handling for the marginal notes, which have a different
    format within definitions, as well as for labels, which are indicated by a definedtermen tag within the definition)."""

    def __init__(self, parent, tree):
        SectionItem.__init__(self, parent, tree)
        self.definedTerms = []  # collect list of all terms defined in this definition section
        for item in self.items:
            if isinstance(item, TextItem):
                self.definedTerms.extend(item.getDefinedTerms())
        self.termDefined = None  # filled in by verifyDefinition
        self.verifyDefinition()

    def verifyDefinition(self):
        """Internal method that reconciles the definitions indicated by the text and the SL. Sets the internal
        self.termDefined member that indicates which term is being defined by this item (lower-cased;
        the text definition wins when both are available, and "" is used when neither is).
        @rtype: None"""
        # get definition from the SL
        labelDef = None
        if not self.sectionLabel.hasLastDefinition():
            # show error if the sectionlabel doesn't reflect this as a definition
            showError("SL does not end with definition, for DefinitionItem.", location=self)
        elif self.sectionLabel.hasLastEmptyDefinition():
            # show an error if we have an empty defined term in the sectionLabel
            # (except for repealed provisions, which we don't really care about)
            if "repealed" not in self.getRawText(limit=100).lower():
                showError("Empty definition.", location=self)
        else:
            labelDef = self.sectionLabel.getLastDefinitionString()  # definition according to the SL
        if labelDef is not None:
            labelDef = labelDef.lower()

        # get definition from the text
        textDef = None
        if len(self.definedTerms) == 0:
            showError("DefinitionItem with no definedTerms.", location=self)
        else:
            # multiple defined terms are not unusual, so no error is raised for them;
            # the first one is taken and checked for consistency with the labelling below
            textDef = self.definedTerms[0]
        if textDef is not None:
            textDef = textDef.lower()

        if labelDef is None and textDef is None:
            showError("DefinitionItem with no definition specified in any manner.", location=self)
            self.termDefined = ""
            return
        if labelDef is None:
            self.termDefined = textDef
            return
        if textDef is None:
            self.termDefined = labelDef
            return
        # an inconsistency is only reported when the label is not even a stem of the text definition
        if labelDef != textDef and labelDef != textDef[:len(labelDef)]:
            showError("Label definition is not stem of text definition. txt:["+textDef+"] lbl:["+labelDef+"]", location=self)
        self.termDefined = textDef  # but either way, go with text definition.

    def extractMetaData(self):
        """Extract information on section label / marginal note, and returns the list of remaining subitems to be processed.
        This overrides the normal meta-data extraction, since definitions shouldn't have labels but have differently tagged marginal notes.
        """
        subsecs = []  # TODO: factor this out into a method that can be overriden for definitions
        for child in self.tree:
            if child.tag == "marginalnote":
                tmp = child.englishMarginalText()
                if tmp is not None:
                    if self.marginalNote is not None:
                        showError("Multiple marginal notes: [" + self.marginalNote + "][" + tmp +"]", location=self)
                    # only a note that actually has English text replaces the stored note;
                    # a None result no longer wipes out a note captured earlier
                    self.marginalNote = tmp
            elif child.tag == "historicalnote":
                # I don't think there should ever be historical notes to definition sections.
                self.historicalNote = child.getRawText()
            else:
                subsecs.append(child)
        return subsecs

    def getParagraphs(self, renderContext, skipLabel=False):
        """Returns the anchor paragraph followed by the paragraphs of all text contained in the definition.
        skipLabel is accepted for interface compatibility but is not consulted here."""
        anchor = renderContext.renderAnchor(self.statute.getStatuteData().getAnchor(self.getSectionLabel()))
        paragraphs = [Paragraph(text=anchor, renderContext=renderContext, indentLevel=self.getIndentLevel(), forceNewParagraph=True)]
        paragraphs += self.getSubParagraphs(renderContext)  # add in all text contained in the definitionItem
        paragraphs[0].forceNewParagraph = True  # force first paragraph to start a new paragraph
        return paragraphs

    def getDefinedTerm(self):
        """Returns the term defined by this item, as settled by verifyDefinition.
        @rtype: str
        """
        return self.termDefined
class FormulaItem(BaseItem):
    """Top level item for a formulagroup node. Handles the initial formula. These items have "Formula" groups
    instead of Labels, and are at the same section label as preceding text (but force a new paragraph).
    The Formula sub-items are handled as ordinary sections."""

    def __init__(self, parent, tree):
        BaseItem.__init__(self, parent, tree)
        self.marginalNote = None
        self.formulaString = None
        subsecs = self.extractMetaData()
        self.handleSubsections(subsecs)

    def getFormulaString(self):
        """Returns the formula text, or "" if the group has none.
        @rtype: str"""
        if self.formulaString is None:
            # this is not an error -- sometimes the variables are discussed in text, or the formula
            # is given in a separate part of the text from the variables.
            return ""
        return self.formulaString

    def separateLabelLine(self):
        """The "label" of the formula should be pushed to its own line (as well as starting a new paragraph)."""
        return True

    def extractMetaData(self):
        """Extract the marginal note and formula string, and returns the list of remaining subitems to be processed."""
        subsecs = []  # TODO: factor this out into a method that can be overriden for definitions
        for child in self.tree:
            if child.tag == "marginalnote":
                self.marginalNote = child.getRawText().strip()
            elif child.tag == "formula":
                if self.formulaString is not None:
                    showError("formulaString encountered after another. ["+ self.formulaString +"]["+child.getRawText().strip()+"]", location=self)
                self.formulaString = child.getRawText().strip()
                if len(subsecs) > 0:
                    showError("formulaString encountered after other text ["+ self.formulaString +"]["+str(subsecs)+"]", location=self)
            # TODO: improve handling of historical notes!
            elif isinstance(child, XMLStatParse.TextNode) and child.getRawText() == "":
                pass  # ignore empty textnodes
            else:
                subsecs.append(child)
        return subsecs

    def getIndentLevel(self):
        """A formula sits at the same indent level as its parent."""
        return self.parent.getIndentLevel()

    def getImmediateSectionLabel(self):
        """A formula group never carries its own section label."""
        return None

    def getParagraphs(self, renderContext, skipLabel=False):
        """Returns the bolded formula as its own paragraph, followed by the subitems' paragraphs
        (the first of which is forced to start a new paragraph)."""
        paragraphs = [Paragraph(text=renderContext.boldText(self.getFormulaString()), renderContext=renderContext, forceNewParagraph=True, indentLevel=self.getIndentLevel())]
        followers = self.getSubParagraphs(renderContext)
        if len(followers) > 0:
            followers[0].forceNewParagraph = True
        return paragraphs + followers

    def getRawText(self, limit=500):
        """Returns the formula string (used for debugging), cut to limit characters.
        Always returns a string: "" when there is no formula, so callers may take len() or concatenate.
        @rtype: str"""
        return self.getFormulaString()[:limit]
class ReadAsItem(BaseItem):
    """Class representing a read-as text block. """

    def __init__(self, parent, tree):
        BaseItem.__init__(self, parent, tree)
        sections = self.extractSectionSubtree(tree)  # the subtree of the sections being read-as
        self.handleSubsections(sections)

    def getSectionLabel(self):
        """A read-as block always reports its parent's section label."""
        return self.parent.getSectionLabel()

    def extractSectionSubtree(self, tree):
        """Returns the list of nodes of a readastext tree that contain section data.

        Normally these are the section-like children of the single sectionpiece node. If there is no
        sectionpiece, any sections found directly under the readastext are returned instead (with an
        error reported), and an empty list is returned if nothing usable was found at all.
        @rtype: list"""
        sectionPieces = []
        sections = []
        for node in tree:
            if isinstance(node, XMLStatParse.TextNode):
                if node.getRawText().strip() != "":
                    showError("Text found in a readastext: ["+node.getRawText()+"]", location=self)
            elif node.tag == "sectionpiece":
                sectionPieces.append(node)
            elif node.tag in sectionTypes or node.tag == "formulagroup":
                sections.append(node)
            else:
                showError("Bad node found in readastext: ["+node.getTag()+"]", location=self)

        if len(sectionPieces) > 1:
            showError("Multiple sectionpieces found in readastext: ["+ str(len(sectionPieces))+"]", location=self)
        elif len(sectionPieces) == 0:
            if len(sections) > 0:
                showError("No sectionpieces found in readas, but direct sections found", location=self)
            else:
                showError("Nothing found in readastext", location=self)
            # without a sectionpiece there is nothing further to unpack; hand back whatever direct
            # sections exist (possibly none) instead of indexing into the empty list below
            return sections
        if len(sections) > 0:
            showError("Both sections and sectionpieces found in readastext", location=self)

        # only the first sectionpiece is unpacked
        sectionPiece = sectionPieces[0]
        sections = []
        for node in sectionPiece:
            if isinstance(node, XMLStatParse.TextNode):
                if node.getRawText().strip() != "":
                    showError("Text found in a sectionpiece: ["+node.getRawText()+"]", location=self)
            elif node.tag in sectionTypes:
                sections.append(node)
            elif node.tag == "formulagroup" or node.tag == "provision":
                sections.append(node)
            else:
                showError("Bad node found in sectionpiece: ["+node.getTag()+"]", location=self)
        if len(sections) < 1:
            showError("No sections found in sectionpiece: ["+ str(len(sections))+"]", location=self)
        return sections
class TextItem(BaseItem):
    """Class for a blob of text, possibly with embedded links and other decorations. Is called on nodes of
    the tree which just embed text, and not further subsections.
    Text inside the TextItem is stored as a linked list of Piece objects."""

    def __init__(self, parent, tree, forceNewParagraph=False):
        BaseItem.__init__(self, parent, tree)
        self.forceNewParagraph = forceNewParagraph  # force this TextItem to start a new paragraph
        head = textutil.Piece(self, isSpaced=False)  # dummy piece to start linked list
        self.firstPiece = head
        self.lastPiece = head
        self.processTree(self.tree)
        self.decoratedText = self.firstPiece.assembleText()
        # list of defined terms appearing in this text block
        # TODO: extract defined terms from the applicable decorators
        self.definedTerms = self.decoratedText.getDefinedTerms()

    @staticmethod
    def isWrittenText(stack):
        """Returns True if any tag on the stack is one of the textTriggers, i.e. the text being
        processed should be visible in the output."""
        return any(tag in textTriggers for tag in stack)

    def addPiece(self, piece):
        """Adds a new piece after the current last piece."""
        tail = self.lastPiece
        tail.setNextPiece(piece)
        self.lastPiece = piece

    def processTree(self, tree, stack=None):
        """Walks the tree recursively, appending a Piece for every defined term, cross-reference and
        visible text node. stack holds the tags of the enclosing nodes (created on the initial call).
        Raises StatuteException if the nesting exceeds 100 levels."""
        if stack is None:
            stack = []  # create stack on initial call
        if len(stack) > 100:
            raise StatuteException("Stackoverflow")
        stack.append(tree.tag)
        for node in tree:  # iterate over the subitems
            if node.tag == "definedtermen":
                self.addPiece(textutil.DefinedTermPiece(parent=self, text=node.getSpacedRawText().strip()))
                continue
            if node.tag == "xrefexternal" or node.tag == "xrefinternal":
                # both kinds of cross-reference become link pieces without a pinpoint
                self.addPiece(textutil.LinkPiece(parent=self, text=node.getSpacedRawText(), pinpoint=None))
                continue
            if isinstance(node, XMLStatParse.TextNode):
                # TextNodes correspond to text in the xml file. Only include if we are inside a text-trigger tag.
                txt = node.getRawText().strip()  # to strip off leading/trailing spaces / new lines
                if txt == "":
                    continue
                if self.isWrittenText(stack):
                    self.addPiece(textutil.TextPiece(parent=self, text=txt))
                else:
                    # if we are ignoring non-trivial text, report it so we know there is more to handle.
                    showError("Unprocessed text: [TXT: "+ txt + "][STACK: "+str(stack)+"]", location=self)
                continue
            if node.tag in sectionTypes or node.tag in formulaSectionTypes:
                showError("Found a section label in text: ["+node.tag+"]", location=self)
                continue
            if node.tag not in knownTextTags:
                showError("Unknown tag found in text: ["+node.tag+"]", location=self)
            self.processTree(tree=node, stack=stack)  # otherwise recurse down to the contents of this item.
        stack.pop()

    def getText(self):
        """Calls the getText method on the underlying decorated-text object, returning the plain text contents."""
        return self.decoratedText.getText()

    def getDecoratedText(self):
        """Returns the decorated-text object assembled from this item's pieces.
        @rtype: DecoratedText.DecoratedText
        """
        return self.decoratedText

    def getRenderedText(self, renderContext, skipLabel=False, baseLevel=0):
        """Calls the getRenderedText method on the underlying decorated-text object, with the supplied
        RenderContext. skipLabel and baseLevel are accepted for interface compatibility only."""
        return self.decoratedText.getRenderedText(renderContext)

    def getParagraphs(self, renderContext, skipLabel=False):
        """Return the rendered text of this item bundled into a single-element list of Paragraph objects."""
        rendered = self.getRenderedText(renderContext)
        paragraph = Paragraph(text=rendered, renderContext=renderContext, indentLevel=self.getIndentLevel(),
                              forceNewParagraph=self.forceNewParagraph)
        return [paragraph]

    def getDefinedTerms(self):
        """Returns the list of defined terms appearing in this text block."""
        return self.definedTerms

    def getRawText(self, limit=500):
        """Returns the plain text of the item, cut to limit characters (used for debugging)."""
        plain = self.decoratedText.getText()
        return plain[:limit]
#####
#
# Objects for handling headings (Parts, Divisions, Subdivisions of statutes)
#
#####
class HeadingItem(StatutePart):
    """A heading in the statute (part / division / subdivision, or unlabeled floating heading text).

    On construction, the label and title text are read from the heading's tree node, and, when a label
    is present, it is cross-checked against the node's labels value and turned into a SegmentNumbering."""

    def __init__(self, parent=None, statute=None, tree=None):
        """Raises StatuteException if no tree is provided."""
        StatutePart.__init__(self, parent=parent, statute=statute)  # Heading items do not have parents, just the statute
        if tree is None:
            raise StatuteException("No tree provided to HeadingItem")
        self.tree = tree
        self.titleString = None
        self.labelString = None  # Label assigned to this heading (part/division/etc.)
        self.numbering = None  # SegmentNumbering for this segment (only non-None if labeled)
        self.processHeadingData()
        self.confirmLabel()

    def processHeadingData(self):
        """Extracts heading information (label and title text) from the tree of the heading node.
        Any other non-empty child node is reported as an error."""
        subsecs = []  # nodes that are neither label, title text nor empty text
        for child in self.tree:
            if child.tag == "label":
                self.labelString = child.getRawText().strip()
            elif child.tag == "titletext":
                self.titleString = child.getRawText().strip()
            elif isinstance(child, XMLStatParse.TextNode) and child.getRawText() == "":
                pass  # ignore empty textnodes
            else:
                subsecs.append(child)
        if len(subsecs) > 0:
            showError("Excess nodes in headingitem: [" + str(subsecs) + "]", location=self)

    def confirmLabel(self):
        """Confirm that the label seen on the item is consistent with the information in the tree's labels value,
        and creates a numbering for the heading, if so. If not, show an error.

        Does nothing for unlabeled headings. The numbering is created as soon as the label string itself
        is well formed; the cross-check against the labels value only reports errors."""
        # confirm that we have a valid segmentType and create SegmentNumbering object
        if self.labelString is None:
            return
        pieces = self.labelString.split()
        if len(pieces) != 2:
            showError("Incorrect number of pieces in heading label: ["+self.labelString+"]", location=self)
            return
        segmentType = pieces[0].lower().strip()
        segmentLabel = pieces[1]
        if segmentType not in Constants.segmentTypes:
            showError("Unknown segment type for heading: ["+self.labelString+"]", location=self)
            return
        self.numbering = SectionLabelLib.SegmentNumbering(segmentType = segmentType, labelString=segmentLabel)

        # cross-check against the labels parameter of the tree
        labels = self.tree.labels
        if labels is None:
            showError("No labels parameter on heading node: ["+self.labelString+"]", location=self)
            return  # nothing to cross-check against
        labelsText = str(labels)  # labels is a sequence of pairs, so it must be converted before concatenation
        # codes expected on the leading label entries for each segment type; one further entry follows them
        expected = {"part": ["ga"], "division": ["ga", "gb"], "subdivision": ["ga", "gb", "gc"]}.get(segmentType)
        if (expected is None or len(labels) != len(expected) + 1
                or [entry[0] for entry in labels[:len(expected)]] != expected):
            showError("Inconsistency with reported heading in labels parameter (not expected segments) ["+labelsText+"]["+self.labelString+"]", location=self)
        if len(labels) < 2:
            return  # no second-last entry to compare the segment label against
        # check part after "_" in the second last label value
        codePieces = labels[-2][1].split("_")
        if len(codePieces) < 2 or codePieces[1].lower() != segmentLabel.lower():
            showError("Inconsistency with reported heading in labels parameter (segment label does not match) ["+labelsText+"]["+self.labelString+"]", location=self)

    def isLabeled(self):
        """Returns True if this HeadingItem has a formal label (part, division, etc.), as opposed to simply being floating text."""
        return self.labelString is not None

    def getNumbering(self):
        """Returns the SegmentNumbering for this heading, or None if it has no valid label."""
        return self.numbering

    def getTitleString(self):
        """Returns the title text of the heading, or None if it has none."""
        return self.titleString

    def getLabelString(self):
        """Returns the label string of the heading, or None if it is unlabeled."""
        return self.labelString

    def getLocationString(self):
        # TODO: Provide a string based on the heading information
        return ""
#####
#
# Paragraph class, used for formatted text output from Items
#
#####
class Paragraph(object):
    """Class for encapsulating a (part of a) paragraph of rendered text, along with logic for determining
    when paragraphs can be connected, and outputting final results."""

    def __init__(self, text, renderContext, indentLevel=0, isMarginalNote=False, forceNewParagraph=False, softSpace=False):
        """text - the raw text of the paragraph
        renderContext - object used to produce the final output (renderMarginalNote / indentText)
        indentLevel - level to which the text should be indented
        isMarginalNote - True if this paragraph should be rendered as a marginal note
        forceNewParagraph - True if this paragraph should not be added to the end of the prior paragraph, even if at the same level
        softSpace - True if a space should be added to the end of this paragraph before merging with an alphanumeric-started paragraph."""
        self.text = text
        self.indentLevel = indentLevel
        self.isMarginalNote = isMarginalNote
        self.forceNewParagraph = forceNewParagraph
        self.softSpace = softSpace
        self.renderContext = renderContext

    def merge(self, nextParagraph):
        """Attempts to merge the text of nextParagraph onto the end of this paragraph.
        Returns True if successful, else False (in which case this paragraph is unchanged).
        Merging is refused when the next paragraph forces a new paragraph, when either paragraph is a
        marginal note, or when the indent levels differ."""
        if nextParagraph.forceNewParagraph:
            return False
        if self.isMarginalNote or nextParagraph.isMarginalNote:
            return False  # marginal notes can't be merged
        if self.indentLevel != nextParagraph.indentLevel:
            return False
        # TODO - when merging a length-0 paragraph, we should presumably maintain our softSpace rule (or do an "or"?).
        # There shouldn't be length-0 paragraphs though.
        spacer = u""
        if self.softSpace and len(nextParagraph.text) > 0 and nextParagraph.text[0].isalnum():
            spacer = u" "  # soft space only materializes in front of alphanumeric text
        self.text += spacer + nextParagraph.text
        self.softSpace = nextParagraph.softSpace  # the merged paragraph now ends the way the next one did
        return True

    def getRenderedText(self, baseLevel=0):
        """Returns the final rendered text. Marginal notes are rendered through renderMarginalNote;
        everything else is indented by its indentLevel relative to baseLevel, never less than zero."""
        if self.isMarginalNote:
            return self.renderContext.renderMarginalNote(self.text)
        indent = self.indentLevel - baseLevel
        if indent < 0:
            indent = 0  # clamp: text above the base level is rendered flush with it
        return self.renderContext.indentText(self.text, level = indent)