// Copyright 2019 Google LLC. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // syntax = "proto3"; package google.cloud.dialogflow.v2; import "google/api/annotations.proto"; import "google/api/resource.proto"; import "google/cloud/dialogflow/v2/audio_config.proto"; import "google/cloud/dialogflow/v2/context.proto"; import "google/cloud/dialogflow/v2/intent.proto"; import "google/cloud/dialogflow/v2/session_entity_type.proto"; import "google/protobuf/struct.proto"; import "google/rpc/status.proto"; import "google/type/latlng.proto"; option cc_enable_arenas = true; option csharp_namespace = "Google.Cloud.Dialogflow.V2"; option go_package = "google.golang.org/genproto/googleapis/cloud/dialogflow/v2;dialogflow"; option java_multiple_files = true; option java_outer_classname = "SessionProto"; option java_package = "com.google.cloud.dialogflow.v2"; option objc_class_prefix = "DF"; // A session represents an interaction with a user. You retrieve user input // and pass it to the [DetectIntent][google.cloud.dialogflow.v2.Sessions.DetectIntent] (or // [StreamingDetectIntent][google.cloud.dialogflow.v2.Sessions.StreamingDetectIntent]) method to determine // user intent and respond. service Sessions { // Processes a natural language query and returns structured, actionable data // as a result. This method is not idempotent, because it may cause contexts // and session entity types to be updated, which in turn might affect // results of future queries. rpc DetectIntent(DetectIntentRequest) returns (DetectIntentResponse) { option (google.api.http) = { post: "/v2/{session=projects/*/agent/sessions/*}:detectIntent" body: "*" }; } // Processes a natural language query in audio format in a streaming fashion // and returns structured, actionable data as a result. This method is only // available via the gRPC API (not REST). rpc StreamingDetectIntent(stream StreamingDetectIntentRequest) returns (stream StreamingDetectIntentResponse) { } } // The request to detect user's intent. message DetectIntentRequest { // Required. The name of the session this query is sent to. Format: // `projects//agent/sessions/`. It's up to the API // caller to choose an appropriate session ID. It can be a random number or // some type of user identifier (preferably hashed). The length of the session // ID must not exceed 36 bytes. string session = 1; // Optional. The parameters of this query. QueryParameters query_params = 2; // Required. The input specification. It can be set to: // // 1. an audio config // which instructs the speech recognizer how to process the speech audio, // // 2. a conversational query in the form of text, or // // 3. an event that specifies which intent to trigger. QueryInput query_input = 3; // Optional. Instructs the speech synthesizer how to generate the output // audio. If this field is not set and agent-level speech synthesizer is not // configured, no output audio is generated. OutputAudioConfig output_audio_config = 4; // Optional. The natural language speech audio to be processed. This field // should be populated iff `query_input` is set to an input audio config. // A single request can contain up to 1 minute of speech audio data. bytes input_audio = 5; } // The message returned from the DetectIntent method. message DetectIntentResponse { // The unique identifier of the response. It can be used to // locate a response in the training example set or for reporting issues. string response_id = 1; // The selected results of the conversational query or event processing. // See `alternative_query_results` for additional potential results. QueryResult query_result = 2; // Specifies the status of the webhook request. google.rpc.Status webhook_status = 3; // The audio data bytes encoded as specified in the request. // Note: The output audio is generated based on the values of default platform // text responses found in the `query_result.fulfillment_messages` field. If // multiple default text responses exist, they will be concatenated when // generating audio. If no default platform text responses exist, the // generated audio content will be empty. bytes output_audio = 4; // The config used by the speech synthesizer to generate the output audio. OutputAudioConfig output_audio_config = 6; } // Represents the parameters of the conversational query. message QueryParameters { // Optional. The time zone of this conversational query from the // [time zone database](https://www.iana.org/time-zones), e.g., // America/New_York, Europe/Paris. If not provided, the time zone specified in // agent settings is used. string time_zone = 1; // Optional. The geo location of this conversational query. google.type.LatLng geo_location = 2; // Optional. The collection of contexts to be activated before this query is // executed. repeated Context contexts = 3; // Optional. Specifies whether to delete all contexts in the current session // before the new ones are activated. bool reset_contexts = 4; // Optional. Additional session entity types to replace or extend developer // entity types with. The entity synonyms apply to all languages and persist // for the session of this query. repeated SessionEntityType session_entity_types = 5; // Optional. This field can be used to pass custom data into the webhook // associated with the agent. Arbitrary JSON objects are supported. google.protobuf.Struct payload = 6; // Optional. Configures the type of sentiment analysis to perform. If not // provided, sentiment analysis is not performed. SentimentAnalysisRequestConfig sentiment_analysis_request_config = 10; } // Represents the query input. It can contain either: // // 1. An audio config which // instructs the speech recognizer how to process the speech audio. // // 2. A conversational query in the form of text,. // // 3. An event that specifies which intent to trigger. message QueryInput { // Required. The input specification. oneof input { // Instructs the speech recognizer how to process the speech audio. InputAudioConfig audio_config = 1; // The natural language text to be processed. TextInput text = 2; // The event to be processed. EventInput event = 3; } } // Represents the result of conversational query or event processing. message QueryResult { // The original conversational query text: // - If natural language text was provided as input, `query_text` contains // a copy of the input. // - If natural language speech audio was provided as input, `query_text` // contains the speech recognition result. If speech recognizer produced // multiple alternatives, a particular one is picked. // - If an event was provided as input, `query_text` is not set. string query_text = 1; // The language that was triggered during intent detection. // See [Language // Support](https://cloud.google.com/dialogflow-enterprise/docs/reference/language) // for a list of the currently supported language codes. string language_code = 15; // The Speech recognition confidence between 0.0 and 1.0. A higher number // indicates an estimated greater likelihood that the recognized words are // correct. The default of 0.0 is a sentinel value indicating that confidence // was not set. // // This field is not guaranteed to be accurate or set. In particular this // field isn't set for StreamingDetectIntent since the streaming endpoint has // separate confidence estimates per portion of the audio in // StreamingRecognitionResult. float speech_recognition_confidence = 2; // The action name from the matched intent. string action = 3; // The collection of extracted parameters. google.protobuf.Struct parameters = 4; // This field is set to: // - `false` if the matched intent has required parameters and not all of // the required parameter values have been collected. // - `true` if all required parameter values have been collected, or if the // matched intent doesn't contain any required parameters. bool all_required_params_present = 5; // The text to be pronounced to the user or shown on the screen. // Note: This is a legacy field, `fulfillment_messages` should be preferred. string fulfillment_text = 6; // The collection of rich messages to present to the user. repeated Intent.Message fulfillment_messages = 7; // If the query was fulfilled by a webhook call, this field is set to the // value of the `source` field returned in the webhook response. string webhook_source = 8; // If the query was fulfilled by a webhook call, this field is set to the // value of the `payload` field returned in the webhook response. google.protobuf.Struct webhook_payload = 9; // The collection of output contexts. If applicable, // `output_contexts.parameters` contains entries with name // `.original` containing the original parameter values // before the query. repeated Context output_contexts = 10; // The intent that matched the conversational query. Some, not // all fields are filled in this message, including but not limited to: // `name`, `display_name` and `webhook_state`. Intent intent = 11; // The intent detection confidence. Values range from 0.0 // (completely uncertain) to 1.0 (completely certain). // If there are `multiple knowledge_answers` messages, this value is set to // the greatest `knowledgeAnswers.match_confidence` value in the list. float intent_detection_confidence = 12; // The free-form diagnostic info. For example, this field could contain // webhook call latency. The string keys of the Struct's fields map can change // without notice. google.protobuf.Struct diagnostic_info = 14; // The sentiment analysis result, which depends on the // `sentiment_analysis_request_config` specified in the request. SentimentAnalysisResult sentiment_analysis_result = 17; } // The top-level message sent by the client to the // `StreamingDetectIntent` method. // // Multiple request messages should be sent in order: // // 1. The first message must contain `session`, `query_input` plus optionally // `query_params` and/or `single_utterance`. The message must not contain `input_audio`. // // 2. If `query_input` was set to a streaming input audio config, // all subsequent messages must contain only `input_audio`. // Otherwise, finish the request stream. message StreamingDetectIntentRequest { // Required. The name of the session the query is sent to. // Format of the session name: // `projects//agent/sessions/`. It’s up to the API // caller to choose an appropriate `Session ID`. It can be a random number or // some type of user identifier (preferably hashed). The length of the session // ID must not exceed 36 characters. string session = 1; // Optional. The parameters of this query. QueryParameters query_params = 2; // Required. The input specification. It can be set to: // // 1. an audio config which instructs the speech recognizer how to process // the speech audio, // // 2. a conversational query in the form of text, or // // 3. an event that specifies which intent to trigger. QueryInput query_input = 3; // Optional. If `false` (default), recognition does not cease until the // client closes the stream. // If `true`, the recognizer will detect a single spoken utterance in input // audio. Recognition ceases when it detects the audio's voice has // stopped or paused. In this case, once a detected intent is received, the // client should close the stream and start a new request with a new stream as // needed. // This setting is ignored when `query_input` is a piece of text or an event. bool single_utterance = 4; // Optional. Instructs the speech synthesizer how to generate the output // audio. If this field is not set and agent-level speech synthesizer is not // configured, no output audio is generated. OutputAudioConfig output_audio_config = 5; // Optional. The input audio content to be recognized. Must be sent if // `query_input` was set to a streaming input audio config. The complete audio // over all streaming messages must not exceed 1 minute. bytes input_audio = 6; } // The top-level message returned from the // `StreamingDetectIntent` method. // // Multiple response messages can be returned in order: // // 1. If the input was set to streaming audio, the first one or more messages // contain `recognition_result`. Each `recognition_result` represents a more // complete transcript of what the user said. The last `recognition_result` // has `is_final` set to `true`. // // 2. The next message contains `response_id`, `query_result` // and optionally `webhook_status` if a WebHook was called. message StreamingDetectIntentResponse { // The unique identifier of the response. It can be used to // locate a response in the training example set or for reporting issues. string response_id = 1; // The result of speech recognition. StreamingRecognitionResult recognition_result = 2; // The result of the conversational query or event processing. QueryResult query_result = 3; // Specifies the status of the webhook request. google.rpc.Status webhook_status = 4; // The audio data bytes encoded as specified in the request. bytes output_audio = 5; // Instructs the speech synthesizer how to generate the output audio. This // field is populated from the agent-level speech synthesizer configuration, // if enabled. OutputAudioConfig output_audio_config = 6; } // Contains a speech recognition result corresponding to a portion of the audio // that is currently being processed or an indication that this is the end // of the single requested utterance. // // Example: // // 1. transcript: "tube" // // 2. transcript: "to be a" // // 3. transcript: "to be" // // 4. transcript: "to be or not to be" // is_final: true // // 5. transcript: " that's" // // 6. transcript: " that is" // // 7. message_type: `MESSAGE_TYPE_END_OF_SINGLE_UTTERANCE` // // 8. transcript: " that is the question" // is_final: true // // Only two of the responses contain final results (#4 and #8 indicated by // `is_final: true`). Concatenating these generates the full transcript: "to be // or not to be that is the question". // // In each response we populate: // // * for `MESSAGE_TYPE_TRANSCRIPT`: `transcript` and possibly `is_final`. // // * for `MESSAGE_TYPE_END_OF_SINGLE_UTTERANCE`: only `message_type`. message StreamingRecognitionResult { // Type of the response message. enum MessageType { // Not specified. Should never be used. MESSAGE_TYPE_UNSPECIFIED = 0; // Message contains a (possibly partial) transcript. TRANSCRIPT = 1; // Event indicates that the server has detected the end of the user's speech // utterance and expects no additional speech. Therefore, the server will // not process additional audio (although it may subsequently return // additional results). The client should stop sending additional audio // data, half-close the gRPC connection, and wait for any additional results // until the server closes the gRPC connection. This message is only sent if // `single_utterance` was set to `true`, and is not used otherwise. END_OF_SINGLE_UTTERANCE = 2; } // Type of the result message. MessageType message_type = 1; // Transcript text representing the words that the user spoke. // Populated if and only if `message_type` = `MESSAGE_TYPE_TRANSCRIPT`. string transcript = 2; // If `false`, the `StreamingRecognitionResult` represents an // interim result that may change. If `true`, the recognizer will not return // any further hypotheses about this piece of the audio. May only be populated // for `message_type` = `MESSAGE_TYPE_TRANSCRIPT`. bool is_final = 3; // The Speech confidence between 0.0 and 1.0 for the current portion of audio. // A higher number indicates an estimated greater likelihood that the // recognized words are correct. The default of 0.0 is a sentinel value // indicating that confidence was not set. // // This field is typically only provided if `is_final` is true and you should // not rely on it being accurate or even set. float confidence = 4; } // Represents the natural language text to be processed. message TextInput { // Required. The UTF-8 encoded natural language text to be processed. // Text length must not exceed 256 characters. string text = 1; // Required. The language of this conversational query. See [Language // Support](https://cloud.google.com/dialogflow-enterprise/docs/reference/language) // for a list of the currently supported language codes. Note that queries in // the same session do not necessarily need to specify the same language. string language_code = 2; } // Events allow for matching intents by event name instead of the natural // language input. For instance, input `` can trigger a personalized welcome response. // The parameter `name` may be used by the agent in the response: // `"Hello #welcome_event.name! What can I do for you today?"`. message EventInput { // Required. The unique identifier of the event. string name = 1; // Optional. The collection of parameters associated with the event. google.protobuf.Struct parameters = 2; // Required. The language of this query. See [Language // Support](https://cloud.google.com/dialogflow-enterprise/docs/reference/language) // for a list of the currently supported language codes. Note that queries in // the same session do not necessarily need to specify the same language. string language_code = 3; } // Configures the types of sentiment analysis to perform. message SentimentAnalysisRequestConfig { // Optional. Instructs the service to perform sentiment analysis on // `query_text`. If not provided, sentiment analysis is not performed on // `query_text`. bool analyze_query_text_sentiment = 1; } // The result of sentiment analysis as configured by // `sentiment_analysis_request_config`. message SentimentAnalysisResult { // The sentiment analysis result for `query_text`. Sentiment query_text_sentiment = 1; } // The sentiment, such as positive/negative feeling or association, for a unit // of analysis, such as the query text. message Sentiment { // Sentiment score between -1.0 (negative sentiment) and 1.0 (positive // sentiment). float score = 1; // A non-negative number in the [0, +inf) range, which represents the absolute // magnitude of sentiment, regardless of score (positive or negative). float magnitude = 2; }