// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.vision.v1p1beta1;

// Schemas referenced by the definitions in this file.
import "google/api/annotations.proto";
import "google/cloud/vision/v1p1beta1/geometry.proto";
import "google/cloud/vision/v1p1beta1/text_annotation.proto";
import "google/cloud/vision/v1p1beta1/web_detection.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/latlng.proto";

// Per-language code generation options.
option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/vision/v1p1beta1;vision";
option java_multiple_files = true;
option java_outer_classname = "ImageAnnotatorProto";
option java_package = "com.google.cloud.vision.v1p1beta1";

// ImageAnnotator performs Google Cloud Vision API detection tasks over client
// images (for example face, landmark, logo, label, and text detection) and
// returns the entities detected in those images.
service ImageAnnotator {
  // Runs image detection and annotation for a batch of images.
  rpc BatchAnnotateImages(BatchAnnotateImagesRequest)
      returns (BatchAnnotateImagesResponse) {
    option (google.api.http) = {
      post: "/v1p1beta1/images:annotate"
      body: "*"
    };
  }
}

// A *Feature* describes one type of Google Cloud Vision API detection task to
// perform over an image. Users list the Features they want; each one encodes
// the Cloud Vision API vertical to operate on and the number of top-scoring
// results to return.
message Feature {
  // The kinds of image feature that can be requested.
  enum Type {
    // No feature type was specified.
    TYPE_UNSPECIFIED = 0;

    // Face detection.
    FACE_DETECTION = 1;

    // Landmark detection.
    LANDMARK_DETECTION = 2;

    // Logo detection.
    LOGO_DETECTION = 3;

    // Label detection.
    LABEL_DETECTION = 4;

    // OCR.
    TEXT_DETECTION = 5;

    // Dense text document OCR. When both DOCUMENT_TEXT_DETECTION and
    // TEXT_DETECTION are present, this one takes precedence.
    DOCUMENT_TEXT_DETECTION = 11;

    // Computer vision models that compute image safe-search properties.
    SAFE_SEARCH_DETECTION = 6;

    // A set of image properties, such as the image's dominant colors.
    IMAGE_PROPERTIES = 7;

    // Crop hints.
    CROP_HINTS = 9;

    // Web detection.
    WEB_DETECTION = 10;
  }

  // Which feature to run.
  Type type = 1;

  // Upper bound on the number of results of this type.
  int32 max_results = 2;

  // Model to use for the feature. Supported values are "builtin/stable"
  // (the default if unset) and "builtin/latest".
  string model = 3;
}

// External image source: either a Google Cloud Storage image location or a
// publicly accessible HTTP/HTTPS image URL.
message ImageSource {
  // NOTE: For new code `image_uri` below is preferred.
  // Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  string gcs_image_uri = 1;

  // Image URI which supports:
  // 1) Google Cloud Storage image URI, which must be in the following form:
  // `gs://bucket_name/object_name` (for details, see
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris)).
  // NOTE: Cloud Storage object versioning is not supported.
  // 2) Publicly accessible image HTTP/HTTPS URL.
  // This is preferred over the legacy `gcs_image_uri` above. When both
  // `gcs_image_uri` and `image_uri` are specified, `image_uri` takes
  // precedence.
  string image_uri = 2;
}

// Client image to perform Google Cloud Vision API tasks over.
message Image {
  // Image content, represented as a stream of bytes.
  // Note: as with all `bytes` fields, Protocol Buffers use a pure binary
  // representation, whereas JSON representations use base64.
  bytes content = 1;

  // External image location (a Google Cloud Storage URI or a publicly
  // accessible HTTP/HTTPS URL; see `ImageSource`). If both `content` and
  // `source` are provided for an image, `content` takes precedence and is
  // used to perform the image annotation request.
  ImageSource source = 2;
}

// Holds the results of face detection for a single face.
message FaceAnnotation {
  // A face-specific landmark (for example, a face feature).
  message Landmark {
    // Face landmark (feature) type.
    // "Left" and "right" are taken from the vantage of the viewer of the
    // image, without considering mirror projections typical of photos. So
    // `LEFT_EYE` is, typically, the person's right eye.
    enum Type {
      // Unknown face landmark detected. Should not be filled.
      UNKNOWN_LANDMARK = 0;

      // Left eye.
      LEFT_EYE = 1;

      // Right eye.
      RIGHT_EYE = 2;

      // Left of left eyebrow.
      LEFT_OF_LEFT_EYEBROW = 3;

      // Right of left eyebrow.
      RIGHT_OF_LEFT_EYEBROW = 4;

      // Left of right eyebrow.
      LEFT_OF_RIGHT_EYEBROW = 5;

      // Right of right eyebrow.
      RIGHT_OF_RIGHT_EYEBROW = 6;

      // Midpoint between eyes.
      MIDPOINT_BETWEEN_EYES = 7;

      // Nose tip.
      NOSE_TIP = 8;

      // Upper lip.
      UPPER_LIP = 9;

      // Lower lip.
      LOWER_LIP = 10;

      // Mouth left.
      MOUTH_LEFT = 11;

      // Mouth right.
      MOUTH_RIGHT = 12;

      // Mouth center.
      MOUTH_CENTER = 13;

      // Nose, bottom right.
      NOSE_BOTTOM_RIGHT = 14;

      // Nose, bottom left.
      NOSE_BOTTOM_LEFT = 15;

      // Nose, bottom center.
      NOSE_BOTTOM_CENTER = 16;

      // Left eye, top boundary.
      LEFT_EYE_TOP_BOUNDARY = 17;

      // Left eye, right corner.
      LEFT_EYE_RIGHT_CORNER = 18;

      // Left eye, bottom boundary.
      LEFT_EYE_BOTTOM_BOUNDARY = 19;

      // Left eye, left corner.
      LEFT_EYE_LEFT_CORNER = 20;

      // Right eye, top boundary.
      RIGHT_EYE_TOP_BOUNDARY = 21;

      // Right eye, right corner.
      RIGHT_EYE_RIGHT_CORNER = 22;

      // Right eye, bottom boundary.
      RIGHT_EYE_BOTTOM_BOUNDARY = 23;

      // Right eye, left corner.
      RIGHT_EYE_LEFT_CORNER = 24;

      // Left eyebrow, upper midpoint.
      LEFT_EYEBROW_UPPER_MIDPOINT = 25;

      // Right eyebrow, upper midpoint.
      RIGHT_EYEBROW_UPPER_MIDPOINT = 26;

      // Left ear tragion.
      LEFT_EAR_TRAGION = 27;

      // Right ear tragion.
      RIGHT_EAR_TRAGION = 28;

      // Left eye pupil.
      LEFT_EYE_PUPIL = 29;

      // Right eye pupil.
      RIGHT_EYE_PUPIL = 30;

      // Forehead glabella.
      FOREHEAD_GLABELLA = 31;

      // Chin gnathion.
      CHIN_GNATHION = 32;

      // Chin left gonion.
      CHIN_LEFT_GONION = 33;

      // Chin right gonion.
      CHIN_RIGHT_GONION = 34;
    }

    // Which landmark this is.
    Type type = 3;

    // Where the landmark is located.
    Position position = 4;
  }

  // Bounding polygon around the face, in the original image's scale, as
  // returned in `ImageParams`. The box is computed to "frame" the face in
  // accordance with human expectations and is based on the landmarker
  // results. If only a partial face appears in the image to be annotated,
  // one or more x and/or y coordinates may not be generated in the
  // `BoundingPoly` (the polygon will be unbounded).
  BoundingPoly bounding_poly = 1;

  // A tighter bounding polygon than `bounding_poly`, enclosing only the skin
  // part of the face. Typically, it is used to eliminate the face from any
  // image analysis that detects the "amount of skin" visible in an image.
  // It is based only on the initial face detection, not on the landmarker
  // results, hence the <code>fd</code> (face detection) prefix.
  BoundingPoly fd_bounding_poly = 2;

  // Detected face landmarks.
  repeated Landmark landmarks = 3;

  // Roll angle: the amount of clockwise/anti-clockwise rotation of the face
  // relative to the image vertical, about the axis perpendicular to the
  // face. Range [-180,180].
  float roll_angle = 4;

  // Yaw angle: the leftward/rightward angle that the face is pointing
  // relative to the vertical plane perpendicular to the image.
  // Range [-180,180].
  float pan_angle = 5;

  // Pitch angle: the upwards/downwards angle that the face is pointing
  // relative to the image's horizontal plane. Range [-180,180].
  float tilt_angle = 6;

  // Detection confidence. Range [0, 1].
  float detection_confidence = 7;

  // Face landmarking confidence. Range [0, 1].
  float landmarking_confidence = 8;

  // Joy likelihood.
  Likelihood joy_likelihood = 9;

  // Sorrow likelihood.
  Likelihood sorrow_likelihood = 10;

  // Anger likelihood.
  Likelihood anger_likelihood = 11;

  // Surprise likelihood.
  Likelihood surprise_likelihood = 12;

  // Under-exposed likelihood.
  Likelihood under_exposed_likelihood = 13;

  // Blurred likelihood.
  Likelihood blurred_likelihood = 14;

  // Headwear likelihood.
  Likelihood headwear_likelihood = 15;
}

// Location information for a detected entity.
message LocationInfo {
  // Location coordinates as a lat/long pair.
  google.type.LatLng lat_lng = 1;
}

// A user-supplied name/value pair.
message Property {
  // The property's name.
  string name = 1;

  // The property's value.
  string value = 2;

  // Value of numeric properties.
  uint64 uint64_value = 3;
}

// Set of detected entity features.
message EntityAnnotation {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string mid = 1;

  // The language code for the locale in which the entity textual
  // `description` is expressed.
  string locale = 2;

  // Entity textual description, expressed in its `locale` language.
  string description = 3;

  // Overall score of the result. Range [0, 1].
  float score = 4;

  // The accuracy of the entity detection in an image.
  // For example, for an image in which the "Eiffel Tower" entity is detected,
  // this field represents the confidence that there is a tower in the query
  // image. Range [0, 1].
  float confidence = 5;

  // The relevancy of the ICA (Image Content Annotation) label to the
  // image. For example, the relevancy of "tower" is likely higher to an image
  // containing the detected "Eiffel Tower" than to an image containing a
  // detected distant towering building, even though the confidence that
  // there is a tower in each image may be the same. Range [0, 1].
  float topicality = 6;

  // Image region to which this entity belongs. Not produced
  // for `LABEL_DETECTION` features.
  BoundingPoly bounding_poly = 7;

  // The location information for the detected entity. Multiple
  // `LocationInfo` elements can be present because one location may
  // indicate the location of the scene in the image, and another location
  // may indicate the location of the place where the image was taken.
  // Location information is usually present for landmarks.
  repeated LocationInfo locations = 8;

  // Some entities may have optional user-supplied `Property` (name/value)
  // fields, such as a score or string that qualifies the entity.
  repeated Property properties = 9;
}

// Set of features pertaining to the image, computed by computer vision
// methods over safe-search verticals (for example, adult, spoof, medical,
// violence).
message SafeSearchAnnotation {
  // Represents the adult content likelihood for the image. Adult content may
  // contain elements such as nudity, pornographic images or cartoons, or
  // sexual activities.
  Likelihood adult = 1;

  // Spoof likelihood. The likelihood that a modification
  // was made to the image's canonical version to make it appear
  // funny or offensive.
  Likelihood spoof = 2;

  // Likelihood that this is a medical image.
  Likelihood medical = 3;

  // Likelihood that this image contains violent content.
  Likelihood violence = 4;

  // Likelihood that the request image contains racy content. Racy content may
  // include (but is not limited to) skimpy or sheer clothing, strategically
  // covered nudity, lewd or provocative poses, or close-ups of sensitive
  // body areas.
  Likelihood racy = 9;
}

// A rectangle described by its min and max `LatLng` pairs.
message LatLongRect {
  // The min lat/long pair.
  google.type.LatLng min_lat_lng = 1;

  // The max lat/long pair.
  google.type.LatLng max_lat_lng = 2;
}

// Describes one color: its RGB channels, its score, and the fraction of the
// image that the color occupies.
message ColorInfo {
  // The color's RGB components.
  google.type.Color color = 1;

  // Image-specific score for this color. Value in range [0, 1].
  float score = 2;

  // Fraction of the image's pixels occupied by the color.
  // Value in range [0, 1].
  float pixel_fraction = 3;
}

// The dominant colors of an image together with their scores.
message DominantColorsAnnotation {
  // RGB color values, each with its score and pixel fraction.
  repeated ColorInfo colors = 1;
}

// Holds properties of an image, such as its dominant colors.
message ImageProperties {
  // If present, dominant colors completed successfully.
  DominantColorsAnnotation dominant_colors = 1;
}

// One crop hint, used to generate a new crop when serving an image.
message CropHint {
  // Bounding polygon of the crop region. The coordinates of the bounding box
  // are in the original image's scale, as returned in `ImageParams`.
  BoundingPoly bounding_poly = 1;

  // Confidence of this being a salient region. Range [0, 1].
  float confidence = 2;

  // Fraction of importance of this salient region with respect to the
  // original image.
  float importance_fraction = 3;
}

// A collection of crop hints, used to generate new crops when serving images.
message CropHintsAnnotation {
  // The crop hint results.
  repeated CropHint crop_hints = 1;
}

// Parameters for a crop hints annotation request.
message CropHintsParams {
  // Desired aspect ratios as floats, each the ratio of the width to the
  // height of the image. For example, an aspect ratio of 4/3 corresponds to
  // the float value 1.33333. If not specified, the best possible crop is
  // returned. At most 16 aspect ratios may be provided; any aspect ratios
  // provided after the 16th are ignored.
  repeated float aspect_ratios = 1;
}

// Parameters for a web detection request.
message WebDetectionParams {
  // Whether results derived from the geo information in the image should be
  // included.
  bool include_geo_results = 2;
}

// Image context and/or feature-specific parameters.
message ImageContext {
  // A lat/long rectangle specifying the location of the image.
  LatLongRect lat_long_rect = 1;

  // Languages to use for TEXT_DETECTION. An empty value yields the best
  // results in most cases, since it enables automatic language detection.
  // Setting `language_hints` is not needed for languages based on the Latin
  // alphabet. In rare cases, when the language of the text in the image is
  // known, setting a hint will help get better results (although it will be
  // a significant hindrance if the hint is wrong). Text detection returns an
  // error if one or more of the specified languages is not one of the
  // [supported languages](/vision/docs/languages).
  repeated string language_hints = 2;

  // Parameters for the crop hints annotation request.
  CropHintsParams crop_hints_params = 4;

  // Parameters for web detection.
  WebDetectionParams web_detection_params = 6;
}

// A request to perform Google Cloud Vision API tasks over one user-provided
// image, with user-requested features.
message AnnotateImageRequest {
  // The image to process.
  Image image = 1;

  // The features requested for this image.
  repeated Feature features = 2;

  // Additional context that may accompany the image.
  ImageContext image_context = 3;
}

// The response to a single image annotation request.
message AnnotateImageResponse {
  // If present, face detection has completed successfully.
  repeated FaceAnnotation face_annotations = 1;

  // If present, landmark detection has completed successfully.
  repeated EntityAnnotation landmark_annotations = 2;

  // If present, logo detection has completed successfully.
  repeated EntityAnnotation logo_annotations = 3;

  // If present, label detection has completed successfully.
  repeated EntityAnnotation label_annotations = 4;

  // If present, text (OCR) detection has completed successfully.
  repeated EntityAnnotation text_annotations = 5;

  // If present, text (OCR) detection or document (OCR) text detection has
  // completed successfully. This annotation provides the structural
  // hierarchy for the OCR detected text.
  TextAnnotation full_text_annotation = 12;

  // If present, safe-search annotation has completed successfully.
  SafeSearchAnnotation safe_search_annotation = 6;

  // If present, image properties were extracted successfully.
  ImageProperties image_properties_annotation = 8;

  // If present, crop hints have completed successfully.
  CropHintsAnnotation crop_hints_annotation = 11;

  // If present, web detection has completed successfully.
  WebDetection web_detection = 13;

  // If set, represents the error message for the operation.
  // Note that filled-in image annotations are guaranteed to be correct,
  // even when `error` is set.
  google.rpc.Status error = 9;
}

// Batches multiple image annotation requests into a single service call.
message BatchAnnotateImagesRequest {
  // The individual image annotation requests that make up this batch.
  repeated AnnotateImageRequest requests = 1;
}

// The response to a batch image annotation request.
message BatchAnnotateImagesResponse {
  // The individual responses to the image annotation requests in the batch.
  repeated AnnotateImageResponse responses = 1;
}

// Likelihood expressed as buckets rather than raw values, which is intended
// to give clients highly stable results across model upgrades.
enum Likelihood {
  // The likelihood is unknown.
  UNKNOWN = 0;

  // The image very unlikely belongs to the specified vertical.
  VERY_UNLIKELY = 1;

  // The image unlikely belongs to the specified vertical.
  UNLIKELY = 2;

  // The image possibly belongs to the specified vertical.
  POSSIBLE = 3;

  // The image likely belongs to the specified vertical.
  LIKELY = 4;

  // The image very likely belongs to the specified vertical.
  VERY_LIKELY = 5;
}