// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha2;

import "google/api/annotations.proto";
import "google/type/latlng.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha2;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha2";
option objc_class_prefix = "ASTSDK";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded Assistant Service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more gRPC connections, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the Assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first gRPC call could be:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.speech_results.transcript "add to my shopping list"
  // * AssistResponse.dialog_state_out.microphone_mode.DIALOG_FOLLOW_ON
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // The user then says *bagels* and the Assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another gRPC
  // call to the `Assist` method, again with streamed requests and
  // responses, such as:
  //
  // * AssistRequest.config
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistRequest.audio_in
  // * AssistResponse.event_type.END_OF_UTTERANCE
  // * AssistResponse.dialog_state_out.microphone_mode.CLOSE_MICROPHONE
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  // * AssistResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // `AssistResponse.audio_out` messages will always contain sequential portions
  // of audio.
  rpc Assist(stream AssistRequest) returns (stream AssistResponse);
}
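
// Illustrative only (not part of this API definition): a minimal Python sketch
// of the request side of one `Assist` call, assuming stubs generated from this
// file (`embedded_assistant_pb2` / `embedded_assistant_pb2_grpc`) and an
// already-authenticated gRPC `channel`; `assist_config` and `mic_chunks()` are
// placeholders for client-specific code.
//
//   stub = embedded_assistant_pb2_grpc.EmbeddedAssistantStub(channel)
//
//   def requests():
//       # The first message carries only `config` ...
//       yield embedded_assistant_pb2.AssistRequest(config=assist_config)
//       # ... and every later message carries only `audio_in`.
//       for chunk in mic_chunks():
//           yield embedded_assistant_pb2.AssistRequest(audio_in=chunk)
//
//   responses = stub.Assist(requests())  # stream of AssistResponse messages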

// The top-level message sent by the client. Clients must send at least two,
// and typically many, `AssistRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message AssistRequest {
  // Exactly one of these fields must be specified in each `AssistRequest`.
  oneof type {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `AssistRequest` message must contain a `config` message.
    AssistConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are sent
    // in sequential `AssistRequest` messages. The first `AssistRequest`
    // message must not contain `audio_in` data and all subsequent
    // `AssistRequest` messages must contain `audio_in` data. The audio bytes
    // must be encoded as specified in `AudioInConfig`.
    // Audio must be sent in approximately real time (16000 samples per second).
    // An error will be returned if audio is sent significantly faster or
    // slower.
    bytes audio_in = 2;
  }
}
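
// Illustrative only: pacing `audio_in` uploads at roughly real time for 16-bit
// LINEAR16 mono audio read from a buffer. A hedged Python sketch; the chunk
// duration is an assumption, not a requirement of the API.
//
//   import time
//
//   SAMPLE_RATE = 16000      # samples per second
//   BYTES_PER_SAMPLE = 2     # 16-bit mono LINEAR16
//   CHUNK_SECONDS = 0.1
//   CHUNK_BYTES = int(SAMPLE_RATE * BYTES_PER_SAMPLE * CHUNK_SECONDS)
//
//   def paced_chunks(audio_bytes):
//       for i in range(0, len(audio_bytes), CHUNK_BYTES):
//           yield audio_bytes[i:i + CHUNK_BYTES]
//           time.sleep(CHUNK_SECONDS)  # keep the upload near real time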

// The top-level message received by the client. A series of one or more
// `AssistResponse` messages is streamed back to the client.
message AssistResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional results
    // until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // *Output-only* Indicates the type of event.
  EventType event_type = 1;

  // *Output-only* The audio containing the Assistant's response to the query.
  AudioOut audio_out = 3;

  // *Output-only* Contains the Assistant's visual response to the query.
  ScreenOut screen_out = 4;

  // *Output-only* Contains the action triggered by the query with the
  // appropriate payloads and semantic parsing.
  DeviceAction device_action = 6;

  // *Output-only* This repeated list contains zero or more speech recognition
  // results that correspond to consecutive portions of the audio currently
  // being processed, starting with the portion corresponding to the earliest
  // audio (and most stable portion) and ending with the portion corresponding
  // to the most recent audio. The strings can be concatenated to view the full
  // in-progress response. When the speech recognition completes, this list
  // will contain one item with `stability` of `1.0`.
  repeated SpeechRecognitionResult speech_results = 2;

  // *Output-only* Contains output related to the user's query.
  DialogStateOut dialog_state_out = 5;

  // *Output-only* Debugging info for the developer. Only returned if the
  // request set `return_debug_info` to true.
  DebugInfo debug_info = 8;
}
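
// Illustrative only: a hedged Python sketch of consuming the streamed
// `AssistResponse` messages from the `Assist` call sketched above
// (`stop_sending_audio`, `render_html`, and `play_audio` are hypothetical
// client-side helpers, not SDK APIs).
//
//   reply_audio = bytearray()
//   for resp in responses:
//       if resp.event_type == embedded_assistant_pb2.AssistResponse.END_OF_UTTERANCE:
//           stop_sending_audio()  # end the request generator / half-close
//       if resp.audio_out.audio_data:
//           reply_audio.extend(resp.audio_out.audio_data)  # chunks arrive in order
//       if resp.screen_out.data:
//           render_html(resp.screen_out.data)
//       if resp.dialog_state_out.supplemental_display_text:
//           print(resp.dialog_state_out.supplemental_display_text)
//   play_audio(bytes(reply_audio))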

// Debug info for the developer. Only returned if the request set
// `return_debug_info` to true.
message DebugInfo {
  // The original JSON response from an Action-on-Google agent to the Google
  // server. See
  // https://developers.google.com/actions/reference/rest/Shared.Types/AppResponse.
  // It will only be populated if the request maker owns the AoG project and the
  // AoG project is in preview mode.
  string aog_agent_to_assistant_json = 1;
}

// Specifies how to process the `AssistRequest` messages.
message AssistConfig {
  oneof type {
    // Specifies how to process the subsequent incoming audio. Required if
    // [AssistRequest.audio_in][google.assistant.embedded.v1alpha2.AssistRequest.audio_in]
    // bytes will be provided in subsequent requests.
    AudioInConfig audio_in_config = 1;

    // The text input to be sent to the Assistant. This can be populated from a
    // text interface if audio input is not available.
    string text_query = 6;
  }

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Optional* Specifies the desired format to use when the server returns a
  // visual screen response.
  ScreenOutConfig screen_out_config = 8;

  // *Required* Represents the current dialog state.
  DialogStateIn dialog_state_in = 3;

  // Device configuration that uniquely identifies a specific device.
  DeviceConfig device_config = 4;

  // *Optional* Debugging parameters for the whole `Assist` RPC.
  DebugConfig debug_config = 5;
}
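
// Illustrative only: building an `AssistConfig` for an audio query in Python,
// assuming stubs generated from this file; the identifier values shown are
// placeholders.
//
//   config = embedded_assistant_pb2.AssistConfig(
//       audio_in_config=embedded_assistant_pb2.AudioInConfig(
//           encoding=embedded_assistant_pb2.AudioInConfig.LINEAR16,
//           sample_rate_hertz=16000,
//       ),
//       audio_out_config=embedded_assistant_pb2.AudioOutConfig(
//           encoding=embedded_assistant_pb2.AudioOutConfig.LINEAR16,
//           sample_rate_hertz=16000,
//           volume_percentage=50,
//       ),
//       dialog_state_in=embedded_assistant_pb2.DialogStateIn(
//           language_code="en-US",
//           conversation_state=saved_state,  # empty on the very first request
//       ),
//       device_config=embedded_assistant_pb2.DeviceConfig(
//           device_id="my-device-id",              # placeholder
//           device_model_id="my-device-model-id",  # placeholder
//       ),
//   )
//
// For a typed query instead of audio, set `text_query` and omit
// `audio_in_config`.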

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best practices](https://developers.google.com/assistant/sdk/guides/service/python/best-practices/audio).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono).
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is
    // lossless--therefore recognition is not compromised--and
    // requires only about half the bandwidth of `LINEAR16`. This encoding
    // includes the `FLAC` stream header followed by audio data. It supports
    // 16-bit and 24-bit samples; however, not all fields in `STREAMINFO` are
    // supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are from 16000-24000, but 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings are
  // raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an ogg container. The result will be a
    // file which can be played natively on Android and in some browsers (such
    // as Chrome). The quality of the encoding is considerably higher than MP3
    // while using the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are: 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}
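
// Illustrative only: because `MP3` and `OPUS_IN_OGG` payloads are
// self-describing, concatenating the streamed `audio_out.audio_data` chunks
// yields a playable file. A hedged Python sketch, assuming `responses` comes
// from an `Assist` call that requested `OPUS_IN_OGG`:
//
//   with open("reply.ogg", "wb") as f:
//       for resp in responses:
//           if resp.audio_out.audio_data:
//               f.write(resp.audio_out.audio_data)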

// Specifies the desired format for the server to use when it returns a
// `screen_out` response.
message ScreenOutConfig {
  // Possible modes for visual screen-output on the device.
  enum ScreenMode {
    // No video mode specified.
    // The Assistant may respond as if in `OFF` mode.
    SCREEN_MODE_UNSPECIFIED = 0;

    // Screen is off (or has brightness or other settings set so low it is
    // not visible). The Assistant will typically not return a screen response
    // in this mode.
    OFF = 1;

    // The Assistant will typically return a partial-screen response in this
    // mode.
    PLAYING = 3;
  }

  // Current visual screen-mode for the device while issuing the query.
  ScreenMode screen_mode = 1;
}

// Provides information about the current dialog state.
message DialogStateIn {
  // *Required* This field must always be set to the
  // [DialogStateOut.conversation_state][google.assistant.embedded.v1alpha2.DialogStateOut.conversation_state]
  // value that was returned in the prior `Assist` RPC. It should only be
  // omitted (field not set) if there was no prior `Assist` RPC because this is
  // the first `Assist` RPC made by this device after it was first set up
  // and/or after a factory-default reset.
  bytes conversation_state = 1;

  // *Required* Language of the request in
  // [IETF BCP 47 syntax](https://tools.ietf.org/html/bcp47) (for example,
  // "en-US"). See [Language
  // Support](https://developers.google.com/assistant/sdk/reference/rpc/languages)
  // for more information. If you have selected a language for this `device_id`
  // using the
  // [Settings](https://developers.google.com/assistant/sdk/reference/assistant-app/assistant-settings)
  // menu in your phone's Google Assistant app, that selection will override
  // this value.
  string language_code = 2;

  // *Optional* Location of the device where the query originated.
  DeviceLocation device_location = 5;

  // *Optional* If true, the server will treat the request as a new conversation
  // and not use state from the prior request. Set this field to true when the
  // conversation should be restarted, such as after a device reboot, or after a
  // significant lapse of time since the prior query.
  bool is_new_conversation = 7;
}
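
// Illustrative only: the `conversation_state` round-trip in Python, hedged.
// The client stores whatever bytes the server returned and echoes them back on
// the next call (`load_state` / `save_state` are hypothetical persistence
// helpers).
//
//   dialog_state_in = embedded_assistant_pb2.DialogStateIn(
//       language_code="en-US",
//       conversation_state=load_state() or b"",  # empty on the first request
//   )
//   ...
//   for resp in stub.Assist(requests()):
//       if resp.dialog_state_out.conversation_state:
//           save_state(resp.dialog_state_out.conversation_state)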

// *Required* Fields that identify the device to the Assistant.
//
// See also:
//
// * [Register a Device - REST
//   API](https://developers.google.com/assistant/sdk/reference/device-registration/register-device-manual)
// * [Device Model and Instance
//   Schemas](https://developers.google.com/assistant/sdk/reference/device-registration/model-and-instance-schemas)
// * [Device
//   Proto](https://developers.google.com/assistant/sdk/reference/rpc/google.assistant.devices.v1alpha2#device)
message DeviceConfig {
  // *Required* Unique identifier for the device. The id length must be 128
  // characters or less. Example: DBCDW098234. This MUST match the device_id
  // returned from device registration. This device_id is used to match against
  // the user's registered devices to look up the supported traits and
  // capabilities of this device. This information should not change across
  // device reboots. However, it should not be saved across
  // factory-default resets.
  string device_id = 1;

  // *Required* Unique identifier for the device model. The combination of
  // device_model_id and device_id must have been previously associated through
  // device registration.
  string device_model_id = 3;
}

// The audio containing the Assistant's response to the query. Sequential chunks
// of audio data are received in sequential `AssistResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the Assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `AssistResponse` messages.
  bytes audio_data = 1;
}

// The Assistant's visual output response to the query. Enabled by
// `screen_out_config`.
message ScreenOut {
  // Possible formats of the screen data.
  enum Format {
    // No format specified.
    FORMAT_UNSPECIFIED = 0;

    // Data will contain a fully-formed HTML5 layout encoded in UTF-8, e.g.
    // `<html><body><div>...</div></body></html>`. It is intended to be rendered
    // along with the audio response. Note that the HTML5 doctype should be
    // included in the actual HTML data.
    HTML = 1;
  }

  // *Output-only* The format of the provided screen data.
  Format format = 1;

  // *Output-only* The raw screen data to be displayed as the result of the
  // Assistant query.
  bytes data = 2;
}
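
// Illustrative only: a hedged Python sketch that writes an HTML `screen_out`
// payload to disk and opens it in the default browser, as a simple stand-in
// for a device-specific renderer.
//
//   import pathlib
//   import webbrowser
//
//   if resp.screen_out.format == embedded_assistant_pb2.ScreenOut.HTML:
//       out = pathlib.Path("screen_out.html")
//       out.write_bytes(resp.screen_out.data)  # UTF-8 encoded HTML5 document
//       webbrowser.open(out.resolve().as_uri())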

// The response returned to the device if the user has triggered a Device
// Action. For example, a device which supports the query *Turn on the light*
// would receive a `DeviceAction` with a JSON payload containing the semantics
// of the request.
message DeviceAction {
  // JSON containing the device command response generated from the triggered
  // Device Action grammar. The format is given by the
  // `action.devices.EXECUTE` intent for a given
  // [trait](https://developers.google.com/assistant/sdk/reference/traits/).
  string device_request_json = 1;
}
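
// Illustrative only: decoding `device_request_json` in Python and dispatching
// on the trait command. The payload layout shown is an assumption based on the
// `action.devices.EXECUTE` intent; the `turn_on_light` handler is hypothetical.
//
//   import json
//
//   request = json.loads(resp.device_action.device_request_json)
//   for inp in request.get("inputs", []):
//       if inp.get("intent") == "action.devices.EXECUTE":
//           for command in inp["payload"]["commands"]:
//               for execution in command["execution"]:
//                   if execution["command"] == "action.devices.commands.OnOff":
//                       turn_on_light(execution["params"]["on"])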

// The estimated transcription of a phrase the user has spoken. This could be
// a single segment or the full guess of the user's spoken query.
message SpeechRecognitionResult {
  // *Output-only* Transcript text representing the words that the user spoke.
  string transcript = 1;

  // *Output-only* An estimate of the likelihood that the Assistant will not
  // change its guess about this result. Values range from 0.0 (completely
  // unstable) to 1.0 (completely stable and final). The default of 0.0 is a
  // sentinel value indicating `stability` was not set.
  float stability = 2;
}
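
// Illustrative only: assembling a live transcript from `speech_results` in
// Python. Segments are concatenated in order, and a single result with
// `stability` of 1.0 marks the final transcript (hedged sketch).
//
//   text = " ".join(r.transcript for r in resp.speech_results)
//   is_final = (
//       len(resp.speech_results) == 1
//       and resp.speech_results[0].stability >= 1.0
//   )
//   print(("FINAL: " if is_final else "partial: ") + text)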

// The dialog state resulting from the user's query. Multiple of these messages
// may be received.
message DialogStateOut {
  // Possible states of the microphone after an `Assist` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Assist` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* Supplemental display text from the Assistant. This could be
  // the same as the speech spoken in `AssistResponse.audio_out` or it could
  // be some additional information which aids the user's understanding.
  string supplemental_display_text = 1;

  // *Output-only* State information for the subsequent `Assist` RPC. This
  // value should be saved in the client and returned in the
  // [`DialogStateIn.conversation_state`](#dialogstatein) field with the next
  // `Assist` RPC. (The client does not need to interpret or otherwise use this
  // value.) This information should be saved across device reboots. However,
  // this value should be cleared (not saved in the client) during a
  // factory-default reset.
  bytes conversation_state = 2;

  // *Output-only* Specifies the mode of the microphone after this `Assist`
  // RPC is processed.
  MicrophoneMode microphone_mode = 3;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as *Increase the volume*
  // or *Set volume level 4* was recognized, in which case the value will be
  // between 1 and 100 (corresponding to the new volume level of 1% to 100%).
  // Typically, a client should use this volume level when playing the
  // `audio_out` data, and retain this value as the current volume level and
  // supply it in the `AudioOutConfig` of the next `AssistRequest`. (Some
  // clients may also implement other ways to allow the current volume level to
  // be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 4;
}
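
// Illustrative only: a hedged Python sketch of acting on `microphone_mode` and
// `volume_percentage` after the response stream ends (`current_volume` and
// `start_assist_call` are hypothetical client state and helpers).
//
//   dialog = resp.dialog_state_out  # last response carrying dialog state
//   if dialog.volume_percentage:
//       current_volume = dialog.volume_percentage  # reuse in the next AudioOutConfig
//   if dialog.microphone_mode == embedded_assistant_pb2.DialogStateOut.DIALOG_FOLLOW_ON:
//       # Re-open the microphone once playback finishes and start a new call.
//       start_assist_call(conversation_state=dialog.conversation_state)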

// Debugging parameters for the current request.
message DebugConfig {
  // When this field is set to true, the `debug_info` field in `AssistResponse`
  // may be populated. However, it will significantly increase the latency of
  // responses. Do not set this field true in production code.
  bool return_debug_info = 6;
}

// There are three sources of locations. They are used with this precedence:
//
// 1. This `DeviceLocation`, which is primarily used for mobile devices with
//    GPS.
// 2. Location specified by the user during device setup; this is per-user,
//    per-device. This location is used if `DeviceLocation` is not specified.
// 3. Inferred location based on IP address. This is used only if neither of the
//    above is specified.
message DeviceLocation {
  oneof type {
    // Latitude and longitude of device.
    google.type.LatLng coordinates = 1;
  }
}
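
// Illustrative only: supplying a device location in Python, using the
// `google.type.LatLng` message imported above (the coordinates shown are
// placeholders).
//
//   from google.type import latlng_pb2
//
//   device_location = embedded_assistant_pb2.DeviceLocation(
//       coordinates=latlng_pb2.LatLng(latitude=37.422, longitude=-122.084),
//   )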