Skip to content

Commit

Permalink
Enable outputting the replacement value on PDFs (#179)
Browse files Browse the repository at this point in the history
* Enable outputting the replacement value on PDFs

* Disable PDF replacement by default, add test, minor refactor

* Add pdf redaction configuration docs.

Fixes #181.
  • Loading branch information
JessieAMorris authored Dec 19, 2024
1 parent 756553a commit 68b7026
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 14 deletions.
40 changes: 40 additions & 0 deletions docs/docs/filter_policies/pdf.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# PDF Redaction Configuration

PDF redaction can be configured through the `config.pdf` path of a policy.

The available options are:

| Key | Type | Default | Description |
|--------------------------|-----------|-------------|-----------------------------------------------------------------------------------------------------------------------------------|
| `redactionColor` | `string` | `black` | This is the color of the redaction boxes that are drawn over the PII. Available options are `white`, `black`, `red`, and `yellow` |
| `showReplacement` | `boolean` | `false` | If `true`, the replacement value produced by the filter's strategy is drawn on top of the redaction box in the PDF |
| `replacementFont` | `string` | `helvetica` | The font to use for the replacement output. Available options are `helvetica`, `times`, and `courier` |
| `replacementMaxFontSize` | `float` | `12` | The maximum font size for the replacement text. Best efforts will be made to fit the replacement text within the redaction box |
| `replacementFontColor` | `string` | `white` | The font color for the replacement. Available options match the `redactionColor` options |

### An Example PDF Configuration Policy

The following is an example policy setting the PDF redaction options.

```json
{
"name": "example-pdf-policy",
"identifiers": {
"emailAddress": {
"emailAddressFilterStrategies": [
{
"strategy": "REDACT",
"redactionFormat": "{{{REDACTED-%t}}}"
}
]
}
},
"config": {
"pdf": {
"redactionColor": "red",
"showReplacement": true,
"replacementFontColor": "yellow"
}
}
}
```
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@
import ai.philterd.phileas.model.policy.filters.strategies.rules.VinFilterStrategy;
import ai.philterd.phileas.model.policy.filters.strategies.rules.ZipCodeFilterStrategy;
import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;
Expand All @@ -72,6 +77,7 @@
import java.util.Set;

public class EndToEndTestsHelper {
private static final Logger LOGGER = LogManager.getLogger(EndToEndTestsHelper.class);


public static Policy getPolicyWithSentiment(String policyName) throws IOException {
Expand Down Expand Up @@ -454,4 +460,16 @@ public static Policy getPolicyJustPhoneNumber(String policyName) {

}

/**
 * Reports whether the text extracted from a PDF contains the given string.
 *
 * <p>The PDF is loaded from the byte array and its text is extracted with a
 * {@link PDFTextStripper}. A warning is logged when the extracted text is
 * empty after trimming.
 *
 * @param doc the PDF document bytes
 * @param needle the text to look for in the extracted text
 * @return {@code true} if the extracted text contains {@code needle}
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
public static boolean documentContainsText(byte[] doc, String needle) throws IOException {
    try (PDDocument parsedPdf = Loader.loadPDF(doc)) {
        final String extractedText = new PDFTextStripper().getText(parsedPdf);
        final boolean noTextFound = extractedText.trim().isEmpty();

        if (noTextFound) {
            LOGGER.warn("documentContainsText called on a PDF with no text streams");
        }

        return extractedText.contains(needle);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import java.util.List;
import java.util.Properties;

import static ai.philterd.test.phileas.services.EndToEndTestsHelper.documentContainsText;
import static ai.philterd.test.phileas.services.EndToEndTestsHelper.getPdfPolicy;
import static ai.philterd.test.phileas.services.EndToEndTestsHelper.getPolicy;

Expand Down Expand Up @@ -108,6 +109,8 @@ public void pdf1() throws Exception {
final byte[] document = IOUtils.toByteArray(is);
is.close();

Assertions.assertTrue(documentContainsText(document, "Wendy"));

final Path temp = Files.createTempDirectory("philter");

final File file1 = Paths.get(temp.toFile().getAbsolutePath(), "pdf.json").toFile();
Expand All @@ -133,8 +136,10 @@ public void pdf1() throws Exception {
LOGGER.info("Spans: {}", response.getExplanation().appliedSpans().size());
showSpans(response.getExplanation().appliedSpans());

// TODO: How to assert? MD5 gives a different value each time.

// TODO: This only asserts that the text is absent from the PDF's text streams.
// The text could still be present in embedded images; we would need to OCR
// the output for this assertion to be truly valuable.
Assertions.assertFalse(documentContainsText(response.getDocument(), "Wendy"));
}

@Test
Expand All @@ -144,6 +149,8 @@ public void pdf2() throws Exception {
final byte[] document = IOUtils.toByteArray(is);
is.close();

Assertions.assertTrue(documentContainsText(document, "90210"));

final Path temp = Files.createTempDirectory("philter");

final File file1 = Paths.get(temp.toFile().getAbsolutePath(), "pdf.json").toFile();
Expand Down Expand Up @@ -172,7 +179,10 @@ public void pdf2() throws Exception {
// output:
// characterStart: 35; characterEnd: 40; filterType: zip-code; context: context; documentId: documentid; confidence: 0.9; text: 90210; replacement: {{{REDACTED-zip-code}}}; salt: ; ignored: false; classification: null;

// TODO: How to assert? MD5 gives a different value each time.
// TODO: This only asserts that the text is absent from the PDF's text streams.
// The text could still be present in embedded images; we would need to OCR
// the output for this assertion to be truly valuable.
Assertions.assertFalse(documentContainsText(response.getDocument(), "90210"));

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,55 @@

public class Pdf {

@SerializedName("enabled")
@SerializedName("redactionColor")
@Expose
private String redactionColor = "black";

@SerializedName("showReplacement")
@Expose
private boolean showReplacement = false;

@SerializedName("replacementFont")
@Expose
private String replacementFont = "helvetica";

@SerializedName("replacementMaxFontSize")
@Expose
private float replacementMaxFontSize = 12;

@SerializedName("replacementFontColor")
@Expose
private String replacementFontColor;

public String getRedactionColor() {
return redactionColor;
}

public void setRedactionColor(String redactionColor) {
this.redactionColor = redactionColor;
public void setRedactionColor(String replacementColor) {
this.redactionColor = replacementColor;
}

public String getReplacementFont() {
return replacementFont;
}

public void setReplacementFont(String replacementFont) {
this.replacementFont = replacementFont;
}

public float getReplacementMaxFontSize() {
return replacementMaxFontSize;
}

public String getReplacementFontColor() {
return replacementFontColor;
}

public boolean getShowReplacement() {
return showReplacement;
}

public void setShowReplacement(boolean showReplacement) {
this.showReplacement = showReplacement;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
Expand All @@ -44,8 +47,19 @@
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

Expand All @@ -64,13 +78,24 @@ public class PdfRedacter extends PDFTextStripper implements Redacter {
private final List<BoundingBox> boundingBoxes;

private static final Map<String, PDColor> COLORS = new LinkedHashMap<>();
private static final Map<String, PDFont> FONTS = new LinkedHashMap<>();

// Registers the color and font names that a policy may reference.
static {
    // DeviceRGB components are expressed in the range 0.0-1.0 (not 0-255),
    // so every entry uses normalized values.
    COLORS.put("white", new PDColor(new float[]{1, 1, 1}, PDDeviceRGB.INSTANCE));
    COLORS.put("black", new PDColor(new float[]{0, 0, 0}, PDDeviceRGB.INSTANCE));
    COLORS.put("red", new PDColor(new float[]{1, 0, 0}, PDDeviceRGB.INSTANCE));
    COLORS.put("yellow", new PDColor(new float[]{1, 1, 100 / 255F}, PDDeviceRGB.INSTANCE));

    FONTS.put("helvetica", new PDType1Font(Standard14Fonts.FontName.HELVETICA));
    FONTS.put("times", new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN));
    FONTS.put("courier", new PDType1Font(Standard14Fonts.FontName.COURIER));
}

private final boolean showReplacement;
private final float replacementFontSize;
private final PDFont replacementFont;
private final PDColor replacementFontColor;

public PdfRedacter(Policy policy,
Set<Span> spans, PdfRedactionOptions pdfRedactionOptions,
List<BoundingBox> boundingBoxes) throws IOException {
Expand All @@ -79,6 +104,10 @@ public PdfRedacter(Policy policy,
this.spans = spans;
this.pdfRedactionOptions = pdfRedactionOptions;
this.boundingBoxes = boundingBoxes;
this.showReplacement = policy.getConfig().getPdf().getShowReplacement();
this.replacementFont = FONTS.getOrDefault(policy.getConfig().getPdf().getReplacementFont(), FONTS.get("helvetica"));
this.replacementFontSize = policy.getConfig().getPdf().getReplacementMaxFontSize();
this.replacementFontColor = COLORS.getOrDefault(policy.getConfig().getPdf().getReplacementFontColor(), COLORS.get("white"));

}

Expand Down Expand Up @@ -209,27 +238,63 @@ protected void endDocument(PDDocument doc) throws IOException {
for(int pageNumber : rectangles.keySet()) {

final PDPage page = document.getPage(pageNumber);
final PDPageContentStream contentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true);
final PDPageContentStream rectContentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true);
final PDPageContentStream textContentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true);

for(final RedactedRectangle rectangle : rectangles.get(pageNumber)) {

contentStream.addRect(
rectContentStream.addRect(
rectangle.getPdRectangle().getLowerLeftX(),
rectangle.getPdRectangle().getLowerLeftY() - 3,
rectangle.getPdRectangle().getWidth(),
rectangle.getPdRectangle().getHeight() + buffer);

if(showReplacement) {
addReplacementTextToRect(rectangle, textContentStream);
}
}

// Get the color based on the filter.
final PDColor pdColor = COLORS.getOrDefault(policy.getConfig().getPdf().getRedactionColor(), COLORS.get("black"));
contentStream.setNonStrokingColor(pdColor);
contentStream.setRenderingMode(RenderingMode.FILL);
contentStream.fill();
contentStream.close();
rectContentStream.setNonStrokingColor(pdColor);
rectContentStream.setRenderingMode(RenderingMode.FILL);
rectContentStream.fill();
rectContentStream.close();

textContentStream.close();

}

}

/**
 * Writes a span's replacement text, horizontally centered, over its redaction rectangle.
 *
 * <p>The text starts at the configured maximum font size and is shrunk one point at a
 * time until it fits the rectangle's width, but the shrinking never takes the size
 * below one point. Nothing is written when the span has no replacement text.
 *
 * @param rectangle the redacted rectangle whose span supplies the replacement text
 * @param textContentStream the content stream that receives the text operators
 * @throws IOException if the text width cannot be computed or the stream cannot be written
 */
public void addReplacementTextToRect(RedactedRectangle rectangle, PDPageContentStream textContentStream) throws IOException {
    final String replacementText = rectangle.getSpan().getReplacement();

    // Nothing to draw; this also avoids a NullPointerException when measuring the text.
    if (replacementText == null || replacementText.isEmpty()) {
        return;
    }

    // Smallest size the shrink loop may reach, so it never yields a zero or negative size.
    final float minFontSize = 1.0f;

    final float rectangleWidth = rectangle.getPdRectangle().getWidth();
    final float rectangleHeight = rectangle.getPdRectangle().getHeight();

    // Width of the text at a font size of 1; it does not change while shrinking.
    final float unitTextWidth = replacementFont.getStringWidth(replacementText) / 1000.0f;

    float boxFontSize = replacementFontSize;
    while (unitTextWidth * boxFontSize > rectangleWidth && boxFontSize - 1 >= minFontSize) {
        boxFontSize -= 1;
    }
    final float textWidth = unitTextWidth * boxFontSize;

    // Y position is actually based on the font's "baseline", so we use the descent
    // (how far the font goes under the baseline) for the height calculation
    final float textDescent = (replacementFont.getFontDescriptor().getDescent() / 1000.0f) * boxFontSize;

    final float textXLocation = rectangle.getPdRectangle().getLowerLeftX()
            + ((rectangleWidth / 2.0f) - (textWidth / 2.0f));

    final float textYLocation = rectangle.getPdRectangle().getLowerLeftY()
            + ((rectangleHeight / 2.0f) + (textDescent / 2.0f));

    textContentStream.beginText();
    textContentStream.setNonStrokingColor(replacementFontColor);
    textContentStream.setFont(replacementFont, boxFontSize);
    textContentStream.newLineAtOffset(textXLocation, textYLocation);
    textContentStream.showText(replacementText);
    textContentStream.endText();
}

@Override
Expand Down
Loading

0 comments on commit 68b7026

Please sign in to comment.