Created a sample code to demo how to scan a pdf file (#48)

* Created a sample code to demo how to scan a pdf file * Applied prettier * Made changes per Ace’s comments * Made changes related to PR comments * Made changes based on Ace’s comments.
GoogleCloudPlatform · Apr 3, 2018 · b5d4480 · b5d4480
1 parent 95d2e92
commit b5d4480
Show file tree

Hide file tree

Showing 3 changed files with 154 additions and 0 deletions.
diff --git a/vision/samples/detect.v1p2beta1.js b/vision/samples/detect.v1p2beta1.js
@@ -0,0 +1,98 @@
+/**
+ * Copyright 2018, Google, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+function detectPdfText(bucketName, fileName) {
+  // [START vision_async_detect_document_ocr]
+
+  // Imports the Google Cloud client libraries
+  const vision = require('@google-cloud/vision').v1p2beta1;
+
+  // Creates a client
+  const client = new vision.ImageAnnotatorClient();
+
+  /**
+   * TODO(developer): Uncomment the following lines before running the sample.
+   */
+  // Bucket where the file resides
+  // const bucketName = 'my-bucket';
+  // Path to PDF file within bucket
+  // const fileName = 'path/to/document.pdf';
+
+  const gcsSourceUri = `gs://${bucketName}/${fileName}`;
+  const gcsDestinationUri = `gs://${bucketName}/${fileName}.json`;
+
+  const inputConfig = {
+    // Supported mime_types are: 'application/pdf' and 'image/tiff'
+    mimeType: 'application/pdf',
+    gcsSource: {
+      uri: gcsSourceUri,
+    },
+  };
+  const outputConfig = {
+    gcsDestination: {
+      uri: gcsDestinationUri,
+    },
+  };
+  const features = [{type: 'DOCUMENT_TEXT_DETECTION'}];
+  const request = {
+    requests: [
+      {
+        inputConfig: inputConfig,
+        features: features,
+        outputConfig: outputConfig,
+      },
+    ],
+  };
+
+  client
+    .asyncBatchAnnotateFiles(request)
+    .then(results => {
+      console.log(results);
+      const operation = results[0];
+      // Get a Promise representation of the final result of the job
+      operation
+        .promise()
+        .then(filesResponse => {
+          let destinationUri =
+            filesResponse[0].responses[0].outputConfig.gcsDestination.uri;
+          console.log('Json saved to: ' + destinationUri);
+        })
+        .catch(function(error) {
+          console.log(error);
+        });
+    })
+    .catch(function(error) {
+      console.log(error);
+    });
+  // [END vision_async_detect_document_ocr]
+}
+
+//.usage('$0 <command> <local-image-file>', 'Cloud Vision Beta API Samples')
+require(`yargs`) // eslint-disable-line
+  .demand(1)
+  .command(
+    `pdf <bucketName> <fileName>`,
+    `Extracts full text from a pdf file`,
+    {},
+    opts => detectPdfText(opts.bucketName, opts.fileName)
+  )
+  .example(`node $0 pdf my-bucket my-pdf.pdf`)
+  .wrap(120)
+  .recommendCommands()
+  .epilogue(`For more information, see https://cloud.google.com/vision/docs`)
+  .help()
+  .strict().argv;
diff --git a/vision/samples/resources/pdf-ocr.pdf b/vision/samples/resources/pdf-ocr.pdf
diff --git a/vision/samples/system-test/detect.v1p2beta1.test.js b/vision/samples/system-test/detect.v1p2beta1.test.js
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2017, Google, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+const path = require(`path`);
+const storage = require(`@google-cloud/storage`)();
+const test = require(`ava`);
+const tools = require(`@google-cloud/nodejs-repo-tools`);
+const uuid = require(`uuid`);
+
+const bucketName = `nodejs-docs-samples-test-${uuid.v4()}`;
+const cmd = `node detect.v1p2beta1.js`;
+const cwd = path.join(__dirname, `..`);
+
+const files = [`pdf-ocr.pdf`].map(name => {
+  return {
+    name,
+    localPath: path.resolve(path.join(__dirname, `../resources/${name}`)),
+  };
+});
+
+test.before(tools.checkCredentials);
+test.before(async () => {
+  const [bucket] = await storage.createBucket(bucketName);
+  await Promise.all(files.map(file => bucket.upload(file.localPath)));
+});
+
+test.after.always(async () => {
+  const bucket = storage.bucket(bucketName);
+  await bucket.deleteFiles({force: true});
+  await bucket.deleteFiles({force: true}); // Try a second time...
+  await bucket.delete();
+});
+
+test.before(tools.checkCredentials);
+
+test(`should extract text from pdf file`, async t => {
+  const output = await tools.runAsync(
+    `${cmd} pdf ${bucketName} ${files[0].name}`,
+    cwd
+  );
+  t.true(output.includes('sample.pdf.json'));
+});