Skip to content

Commit f49b659

Browse files
sirtorry authored and bshaffer committed
Vision PDF/OCR (GoogleCloudPlatform#622)
1 parent 424469e commit f49b659

28 files changed

+429
-289
lines changed

vision/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ Vision API from PHP.
3939
text Detect text in an image using Google Cloud Vision API
4040
crop-hints Detect crop hints in an image using Google Cloud Vision API
4141
document-text Detect document text in an image using Google Cloud Vision API
42+
pdf Detect text in a PDF/TIFF using Google Cloud Vision API
4243
web Detect web entities in an image using Google Cloud Vision API
4344
web-geo Detect web entities in an image with geo metadata using
4445
Google Cloud Vision API

vision/composer.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"name": "google/vision",
33
"type": "project",
44
"require": {
5-
"google/cloud-vision": "^0.10",
5+
"google/cloud-vision": "^0.13",
66
"google/cloud-storage": "^1.3",
77
"symfony/console": "^3.1"
88
},
@@ -32,7 +32,8 @@
3232
"src/detect_web.php",
3333
"src/detect_web_gcs.php",
3434
"src/detect_web_with_geo_metadata.php",
35-
"src/detect_web_with_geo_metadata_gcs.php"
35+
"src/detect_web_with_geo_metadata_gcs.php",
36+
"src/detect_pdf_gcs.php"
3637
]
3738
},
3839
"require-dev": {

vision/composer.lock

Lines changed: 233 additions & 276 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vision/src/detect_crop_hints.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,5 +47,7 @@ function detect_crop_hints($path)
4747
} else {
4848
print('No crop hints' . PHP_EOL);
4949
}
50+
51+
$imageAnnotator->close();
5052
}
5153
// [END vision_crop_hint_detection]

vision/src/detect_crop_hints_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,7 @@ function detect_crop_hints_gcs($path)
4646
} else {
4747
print('No crop hints' . PHP_EOL);
4848
}
49+
50+
$imageAnnotator->close();
4951
}
5052
// [END vision_crop_hint_detection_gcs]

vision/src/detect_document_text.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,5 +63,7 @@ function detect_document_text($path)
6363
} else {
6464
print('No text found' . PHP_EOL);
6565
}
66+
67+
$imageAnnotator->close();
6668
}
6769
// [END vision_fulltext_detection]

vision/src/detect_document_text_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,5 +63,7 @@ function detect_document_text_gcs($path)
6363
} else {
6464
print('No text found' . PHP_EOL);
6565
}
66+
67+
$imageAnnotator->close();
6668
}
6769
// [END vision_fulltext_detection_gcs]

vision/src/detect_face.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ function detect_face($path, $outFile = null)
103103
printf('Output image written to %s' . PHP_EOL, $outFile);
104104
# [END vision_face_detection_tutorial_run_application]
105105
}
106+
107+
$imageAnnotator->close();
106108
// [START vision_face_detection]
107109
}
108110
// [END vision_face_detection]

vision/src/detect_face_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,7 @@ function detect_face_gcs($path)
5454
print('Bounds: ' . join(', ',$bounds) . PHP_EOL);
5555
print(PHP_EOL);
5656
}
57+
58+
$imageAnnotator->close();
5759
}
5860
// [END vision_face_detection_gcs]

vision/src/detect_image_property.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,7 @@ function detect_image_property($path)
4040
printf("Blue: %s" . PHP_EOL, $color->getBlue());
4141
print(PHP_EOL);
4242
}
43+
44+
$imageAnnotator->close();
4345
}
4446
// [END vision_image_property_detection]

vision/src/detect_image_property_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,5 +44,7 @@ function detect_image_property_gcs($path)
4444
} else {
4545
print('No Results.' . PHP_EOL);
4646
}
47+
48+
$imageAnnotator->close();
4749
}
4850
// [END vision_image_property_detection_gcs]

vision/src/detect_label.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,7 @@ function detect_label($path)
3939
} else {
4040
print('No label found' . PHP_EOL);
4141
}
42+
43+
$imageAnnotator->close();
4244
}
4345
// [END vision_label_detection]

vision/src/detect_label_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,7 @@ function detect_label_gcs($path)
3838
} else {
3939
print('No label found' . PHP_EOL);
4040
}
41+
42+
$imageAnnotator->close();
4143
}
4244
// [END vision_label_detection_gcs]

vision/src/detect_landmark.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,7 @@ function detect_landmark($path)
3535
foreach ($landmarks as $landmark) {
3636
print($landmark->getDescription() . PHP_EOL);
3737
}
38+
39+
$imageAnnotator->close();
3840
}
3941
// [END vision_landmark_detection]

vision/src/detect_landmark_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,7 @@ function detect_landmark_gcs($path)
3434
foreach ($landmarks as $landmark) {
3535
print($landmark->getDescription() . PHP_EOL);
3636
}
37+
38+
$imageAnnotator->close();
3739
}
3840
// [END vision_landmark_detection_gcs]

vision/src/detect_logo.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,7 @@ function detect_logo($path)
3535
foreach ($logos as $logo) {
3636
print($logo->getDescription() . PHP_EOL);
3737
}
38+
39+
$imageAnnotator->close();
3840
}
3941
// [END vision_logo_detection]

vision/src/detect_logo_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,7 @@ function detect_logo_gcs($path)
3434
foreach ($logos as $logo) {
3535
print($logo->getDescription() . PHP_EOL);
3636
}
37+
38+
$imageAnnotator->close();
3739
}
3840
// [END vision_logo_detection_gcs]

vision/src/detect_pdf_gcs.php

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
2+
/**
3+
* Copyright 2018 Google Inc.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
// [START vision_text_detection_pdf_gcs]
19+
namespace Google\Cloud\Samples\Vision;
20+
21+
use Google\Cloud\Storage\StorageClient;
22+
use Google\Cloud\Vision\V1\AnnotateFileResponse;
23+
use Google\Cloud\Vision\V1\AsyncAnnotateFileRequest;
24+
use Google\Cloud\Vision\V1\Feature;
25+
use Google\Cloud\Vision\V1\Feature_Type;
26+
use Google\Cloud\Vision\V1\GcsDestination;
27+
use Google\Cloud\Vision\V1\GcsSource;
28+
use Google\Cloud\Vision\V1\ImageAnnotatorClient;
29+
use Google\Cloud\Vision\V1\InputConfig;
30+
use Google\Cloud\Vision\V1\OutputConfig;
31+
32+
// $path = 'gs://path/to/your/document.pdf'
33+
34+
/**
 * Run asynchronous document text detection (OCR) on a file stored in Google
 * Cloud Storage, list the result objects written under the destination
 * prefix, and print the text found in one of the result batches.
 *
 * The input mime type is fixed to 'application/pdf' in this sample.
 *
 * @param string $path   GCS URI of the file to OCR,
 *                       e.g. 'gs://path/to/your/document.pdf'.
 * @param string $output GCS URI prefix the JSON results are written under,
 *                       in the form 'gs://bucket-name/prefix'.
 *
 * @throws \InvalidArgumentException if $output is not of the form
 *                                   'gs://bucket-name/prefix'.
 * @throws \RuntimeException         if the operation does not succeed or no
 *                                   result object can be read back.
 */
function detect_pdf_gcs($path, $output)
{
    # validate the destination before starting the (slow) operation, so a
    # malformed URI fails immediately instead of after the OCR has run.
    if (preg_match('/^gs:\/\/([a-z0-9\._\-]+)\/(\S+)$/', $output, $match) !== 1) {
        throw new \InvalidArgumentException(sprintf(
            'Output must be a Cloud Storage URI of the form gs://bucket/prefix, got "%s"',
            $output
        ));
    }
    $bucketName = $match[1];
    $prefix = $match[2];

    # select ocr feature
    $feature = (new Feature())
        ->setType(Feature_Type::DOCUMENT_TEXT_DETECTION);

    # set $path (file to OCR) as source
    $gcsSource = (new GcsSource())
        ->setUri($path);
    # supported mime_types are: 'application/pdf' and 'image/tiff'
    $mimeType = 'application/pdf';
    $inputConfig = (new InputConfig())
        ->setGcsSource($gcsSource)
        ->setMimeType($mimeType);

    # set $output as destination
    $gcsDestination = (new GcsDestination())
        ->setUri($output);
    # how many pages should be grouped into each json output file.
    $batchSize = 2;
    $outputConfig = (new OutputConfig())
        ->setGcsDestination($gcsDestination)
        ->setBatchSize($batchSize);

    # prepare request using configs set above
    $request = (new AsyncAnnotateFileRequest())
        ->setFeatures([$feature])
        ->setInputConfig($inputConfig)
        ->setOutputConfig($outputConfig);
    $requests = [$request];

    $imageAnnotator = new ImageAnnotatorClient();
    try {
        # make request
        $operation = $imageAnnotator->asyncBatchAnnotateFiles($requests);
        print('Waiting for operation to finish.' . PHP_EOL);
        $operation->pollUntilComplete();

        # do not read (possibly stale) output if the operation itself failed.
        if (!$operation->operationSucceeded()) {
            $error = $operation->getError();
            throw new \RuntimeException(
                'PDF annotation operation failed'
                . ($error ? ': ' . $error->getMessage() : '.')
            );
        }

        # once the request has completed and the output has been
        # written to GCS, we can list all the output files.
        $storage = new StorageClient();
        $bucket = $storage->bucket($bucketName);
        $options = ['prefix' => $prefix];
        $objects = $bucket->objects($options);

        # save an object for the sample below.
        # NOTE(review): next() is called before current(), exactly as in the
        # original sample; confirm against the iterator implementation which
        # listed object this selects.
        $objects->next();
        $firstObject = $objects->current();

        # list objects with the given prefix.
        print('Output files:' . PHP_EOL);
        foreach ($objects as $object) {
            print($object->name() . PHP_EOL);
        }

        if ($firstObject === null) {
            throw new \RuntimeException(sprintf(
                'No output object could be read under "%s".',
                $output
            ));
        }

        # process the selected output file from GCS.
        # since we specified batch_size=2, one response file contains
        # up to two pages of the input file.
        $jsonString = $firstObject->downloadAsString();
        $firstBatch = new AnnotateFileResponse();
        $firstBatch->mergeFromJsonString($jsonString);

        # get annotation and print text
        foreach ($firstBatch->getResponses() as $response) {
            $annotation = $response->getFullTextAnnotation();
            # a response without a full text annotation has nothing to print.
            if ($annotation === null) {
                continue;
            }
            print($annotation->getText());
        }
    } finally {
        # always release the client, even when a step above throws.
        $imageAnnotator->close();
    }
}
107+
// [END vision_text_detection_pdf_gcs]

vision/src/detect_safe_search.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,7 @@ function detect_safe_search($path)
4646
printf("Spoof: %s" . PHP_EOL, $likelihoodName[$spoof]);
4747
printf("Violence: %s" . PHP_EOL, $likelihoodName[$violence]);
4848
printf("Racy: %s" . PHP_EOL, $likelihoodName[$racy]);
49+
50+
$imageAnnotator->close();
4951
}
5052
// [END vision_safe_search_detection]

vision/src/detect_safe_search_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,5 +49,7 @@ function detect_safe_search_gcs($path)
4949
} else {
5050
print('No Results.' . PHP_EOL);
5151
}
52+
53+
$imageAnnotator->close();
5254
}
5355
// [END vision_safe_search_detection_gcs]

vision/src/detect_text.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,7 @@ function detect_text($path)
4343
}
4444
print('Bounds: ' . join(', ',$bounds) . PHP_EOL);
4545
}
46+
47+
$imageAnnotator->close();
4648
}
4749
// [END vision_text_detection]

vision/src/detect_text_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,7 @@ function detect_text_gcs($path)
4242
}
4343
print('Bounds: ' . join(', ',$bounds) . PHP_EOL);
4444
}
45+
46+
$imageAnnotator->close();
4547
}
4648
// [END vision_text_detection_gcs]

vision/src/detect_web.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,5 +79,7 @@ function detect_web($path)
7979
$entity->getDescription(),
8080
$entity->getScore());
8181
}
82+
83+
$imageAnnotator->close();
8284
}
8385
// [END vision_web_detection]

vision/src/detect_web_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,5 +81,7 @@ function detect_web_gcs($path)
8181
} else {
8282
print('No Results.' . PHP_EOL);
8383
}
84+
85+
$imageAnnotator->close();
8486
}
8587
// [END vision_web_detection_gcs]

vision/src/detect_web_with_geo_metadata.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,7 @@ function detect_web_with_geo_metadata($path)
5252
print(PHP_EOL);
5353
}
5454
}
55+
56+
$imageAnnotator->close();
5557
}
5658
// [END vision_web_detection_include_geo]

vision/src/detect_web_with_geo_metadata_gcs.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,5 +53,7 @@ function detect_web_with_geo_metadata_gcs($path)
5353
} else {
5454
print('No Results.' . PHP_EOL);
5555
}
56+
57+
$imageAnnotator->close();
5658
}
5759
// [END vision_web_detection_include_geo_gcs]

vision/test/visionTest.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,16 @@ public function testDocumentTextCommandGcs()
271271
$this->assertContains('Bounds:', $output);
272272
}
273273

274+
/**
 * Runs the "pdf" command on a PDF in the test bucket and checks that the
 * output-file listing header appears in the command output.
 */
public function testPdfGcs()
{
    $this->requireCloudStorage();

    # both the input file and the result prefix live in the test bucket.
    $bucketUri = 'gs://' . $this->bucketName;
    $output = $this->runCommand(
        'pdf',
        $bucketUri . '/HodgeConj.pdf',
        $bucketUri . '/OCR_PDF_TEST_OUTPUT/'
    );

    $this->assertContains('Output files:', $output);
}
283+
274284
public function testDetectWebNoGeoCommand()
275285
{
276286
$path = __DIR__ . '/data/geotagged.jpg';

0 commit comments

Comments
 (0)