Skip to content

Commit

Permalink
#5265 - Extract paragraph structure from PDF files
Browse files Browse the repository at this point in the history
- Remove some unused legacy classes
- Set up a basic HTML structure in the CASes extracted from PDF files, based on the paragraph detection provided by PDFBox
  • Loading branch information
reckart committed Jan 27, 2025
1 parent 966e106 commit 7a3ad3e
Show file tree
Hide file tree
Showing 9 changed files with 256 additions and 488 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ public class CasXmlHandler
private XmlDocument docNode;
private boolean captureText = true;
private boolean splitSentencesInBlockElements = true;
private boolean commitText = true;

private final Set<ElementListener> listeners = new LinkedHashSet<>();

Expand All @@ -67,6 +68,11 @@ public CasXmlHandler(JCas aJCas)
stack = new ArrayDeque<>();
}

/**
 * Controls whether the text collected by this handler is written to the CAS at the end of
 * the document. {@code endDocument()} only calls {@code jcas.setDocumentText(...)} when this
 * flag is {@code true}; the field is initialized to {@code true}.
 *
 * @param aCommitText
 *            {@code true} to set the document text on the CAS in {@code endDocument()},
 *            {@code false} to skip that step.
 */
public void setCommitText(boolean aCommitText)
{
    this.commitText = aCommitText;
}

public void addListener(ElementListener aListener)
{
listeners.add(aListener);
Expand Down Expand Up @@ -125,7 +131,9 @@ public void endDocument() throws SAXException
l.endDocument(docNode);
}

jcas.setDocumentText(text.toString());
if (commitText) {
jcas.setDocumentText(text.toString());
}

if (!blockElements.isEmpty()) {
if (splitSentencesInBlockElements) {
Expand Down Expand Up @@ -213,7 +221,7 @@ public void endElement(String aUri, String aLocalName, String aQName) throws SAX
}

@Override
public void characters(char[] aCh, int aStart, int aLength) throws SAXException
public void characters(char[] aCh, int aStart, int aLength)
{
if (stack.isEmpty()) {
// We ignore any characters outside the root elements. These could include e.g.
Expand Down Expand Up @@ -241,7 +249,7 @@ public void characters(char[] aCh, int aStart, int aLength) throws SAXException
}

@Override
public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException
public void ignorableWhitespace(char[] aCh, int aStart, int aLength)
{
characters(aCh, aStart, aLength);
}
Expand Down

This file was deleted.

Loading

0 comments on commit 7a3ad3e

Please sign in to comment.