Skip to content

Commit

Permalink
chore: added conditional check to prevent IndexOutOfBoundsException (#343
Browse files Browse the repository at this point in the history
)

* chore: added conditional check to prevent IndexOutOfBoundsException

* nit

* removed first lang part from batchTable sample
  • Loading branch information
munkhuushmgl authored and Shabirmean committed Nov 15, 2022
1 parent f4be66b commit edcb284
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ public static void batchParseFormGcs(
// Initialize client that will be used to send requests. This client only needs to be created
// once, and can be reused for multiple requests. After completing all of your requests, call
// the "close" method on the client to safely clean up any remaining background resources.
try (DocumentUnderstandingServiceClient client =
DocumentUnderstandingServiceClient.create()) {
try (DocumentUnderstandingServiceClient client = DocumentUnderstandingServiceClient.create()) {

// Configure the request for processing the PDF
String parent = String.format("projects/%s/locations/%s", projectId, location);
Expand Down Expand Up @@ -103,17 +102,16 @@ public static void batchParseFormGcs(
// mime_type can be application/pdf, image/tiff,
// and image/gif, or application/json
InputConfig config =
InputConfig.newBuilder().setGcsSource(inputUri)
.setMimeType("application/pdf").build();
InputConfig.newBuilder().setGcsSource(inputUri).setMimeType("application/pdf").build();

GcsDestination gcsDestination = GcsDestination.newBuilder()
.setUri(String.format("gs://%s/%s", outputGcsBucketName, outputGcsPrefix)).build();

OutputConfig outputConfig = OutputConfig.newBuilder()
.setGcsDestination(gcsDestination)
.setPagesPerShard(1)
GcsDestination gcsDestination =
GcsDestination.newBuilder()
.setUri(String.format("gs://%s/%s", outputGcsBucketName, outputGcsPrefix))
.build();

OutputConfig outputConfig =
OutputConfig.newBuilder().setGcsDestination(gcsDestination).setPagesPerShard(1).build();

ProcessDocumentRequest request =
ProcessDocumentRequest.newBuilder()
.setFormExtractionParams(params)
Expand Down Expand Up @@ -165,13 +163,15 @@ public static void batchParseFormGcs(
String text = document.getText();

// Process the output.
Document.Page page1 = document.getPages(0);
for (Document.Page.FormField field : page1.getFormFieldsList()) {
String fieldName = getText(field.getFieldName(), text);
String fieldValue = getText(field.getFieldValue(), text);

System.out.println("Extracted form fields pair:");
System.out.printf("\t(%s, %s))", fieldName, fieldValue);
if (document.getPagesCount() > 0) {
Document.Page page1 = document.getPages(0);
for (Document.Page.FormField field : page1.getFormFieldsList()) {
String fieldName = getText(field.getFieldName(), text);
String fieldValue = getText(field.getFieldValue(), text);

System.out.println("Extracted form fields pair:");
System.out.printf("\t(%s, %s))", fieldName, fieldValue);
}
}

// Clean up temp file.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,24 +165,30 @@ public static void batchParseTableGcs(
String text = document.getText();

// Process the output.
Document.Page page1 = document.getPages(0);
Document.Page.Table table = page1.getTables(0);

System.out.println("Results from first table processed:");
System.out.println("Header row:");

Document.Page.Table.TableRow headerRow = table.getHeaderRows(0);

for (Document.Page.Table.TableCell tableCell : headerRow.getCellsList()) {
if (!tableCell.getLayout().getTextAnchor().getTextSegmentsList().isEmpty()) {
// Extract shards from the text field
// First shard in document doesn't have startIndex property
List<Document.TextAnchor.TextSegment> textSegments =
tableCell.getLayout().getTextAnchor().getTextSegmentsList();
int startIdx =
textSegments.size() > 0 ? (int) textSegments.get(0).getStartIndex() : 0;
int endIdx = (int) textSegments.get(0).getEndIndex();
System.out.printf("\t%s", text.substring(startIdx, endIdx));
if (document.getPagesCount() > 0) {
Document.Page page1 = document.getPages(0);
if (page1.getTablesCount() > 0) {
Document.Page.Table table = page1.getTables(0);

System.out.println("Results from first table processed:");
System.out.println("Header row:");

if (table.getHeaderRowsCount() > 0) {
Document.Page.Table.TableRow headerRow = table.getHeaderRows(0);

for (Document.Page.Table.TableCell tableCell : headerRow.getCellsList()) {
if (!tableCell.getLayout().getTextAnchor().getTextSegmentsList().isEmpty()) {
// Extract shards from the text field
// First shard in document doesn't have startIndex property
List<Document.TextAnchor.TextSegment> textSegments =
tableCell.getLayout().getTextAnchor().getTextSegmentsList();
int startIdx =
textSegments.size() > 0 ? (int) textSegments.get(0).getStartIndex() : 0;
int endIdx = (int) textSegments.get(0).getEndIndex();
System.out.printf("\t%s", text.substring(startIdx, endIdx));
}
}
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,15 @@ public static void parseForm(String projectId, String location, String inputGcsU
String text = response.getText();

// Process the output
Document.Page page1 = response.getPages(0);
for (Document.Page.FormField field : page1.getFormFieldsList()) {
String fieldName = getText(field.getFieldName(), text);
String fieldValue = getText(field.getFieldValue(), text);
if (response.getPagesCount() > 0) {
Document.Page page1 = response.getPages(0);
for (Document.Page.FormField field : page1.getFormFieldsList()) {
String fieldName = getText(field.getFieldName(), text);
String fieldValue = getText(field.getFieldValue(), text);

System.out.println("Extracted form fields pair:");
System.out.printf("\t(%s, %s))", fieldName, fieldValue);
System.out.println("Extracted form fields pair:");
System.out.printf("\t(%s, %s))", fieldName, fieldValue);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,23 +94,27 @@ public static void parseTable(String projectId, String location, String inputGcs
String text = response.getText();

// Get the first table in the document
Document.Page page1 = response.getPages(0);
Document.Page.Table table = page1.getTables(0);

System.out.println("Results from first table processed:");
List<Document.Page.DetectedLanguage> detectedLangs = page1.getDetectedLanguagesList();
String langCode =
detectedLangs.size() > 0 ? detectedLangs.get(0).getLanguageCode() : "NOT_FOUND";
System.out.printf("First detected language: : %s", langCode);

Document.Page.Table.TableRow headerRow = table.getHeaderRows(0);
System.out.println("Header row:");

for (Document.Page.Table.TableCell tableCell : headerRow.getCellsList()) {
if (tableCell.getLayout().getTextAnchor().getTextSegmentsList() != null) {
// Extract shards from the text field
// First shard in document doesn't have startIndex property
System.out.printf("\t%s", getText(tableCell.getLayout(), text));
if (response.getPagesCount() > 0) {
Document.Page page1 = response.getPages(0);
if (page1.getTablesCount() > 0) {
Document.Page.Table table = page1.getTables(0);

System.out.println("Results from first table processed:");
List<Document.Page.DetectedLanguage> detectedLangs = page1.getDetectedLanguagesList();
String langCode =
detectedLangs.size() > 0 ? detectedLangs.get(0).getLanguageCode() : "NOT_FOUND";
System.out.printf("First detected language: : %s", langCode);

Document.Page.Table.TableRow headerRow = table.getHeaderRows(0);
System.out.println("Header row:");

for (Document.Page.Table.TableCell tableCell : headerRow.getCellsList()) {
if (tableCell.getLayout().getTextAnchor().getTextSegmentsList() != null) {
// Extract shards from the text field
// First shard in document doesn't have startIndex property
System.out.printf("\t%s", getText(tableCell.getLayout(), text));
}
}
}
}
}
Expand Down

0 comments on commit edcb284

Please sign in to comment.