// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dialogflow.v2beta1;

import "google/api/annotations.proto";
import "google/cloud/dialogflow/v2beta1/audio_config.proto";
import "google/cloud/dialogflow/v2beta1/context.proto";
import "google/cloud/dialogflow/v2beta1/intent.proto";
import "google/cloud/dialogflow/v2beta1/session_entity_type.proto";
import "google/protobuf/struct.proto";
import "google/rpc/status.proto";
import "google/type/latlng.proto";

option cc_enable_arenas = true;
option csharp_namespace = "Google.Cloud.Dialogflow.V2beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/dialogflow/v2beta1;dialogflow";
option java_multiple_files = true;
option java_outer_classname = "SessionProto";
option java_package = "com.google.cloud.dialogflow.v2beta1";
option objc_class_prefix = "DF";

// A session represents an interaction with a user. You retrieve user input
// and pass it to the
// [DetectIntent][google.cloud.dialogflow.v2beta1.Sessions.DetectIntent] (or
// [StreamingDetectIntent][google.cloud.dialogflow.v2beta1.Sessions.StreamingDetectIntent])
// method to determine user intent and respond.
service Sessions {
  // Processes a natural language query and returns structured, actionable data
  // as a result. This method is not idempotent, because it may cause contexts
  // and session entity types to be updated, which in turn might affect
  // results of future queries.
  rpc DetectIntent(DetectIntentRequest) returns (DetectIntentResponse) {
    option (google.api.http) = {
      post: "/v2beta1/{session=projects/*/agent/sessions/*}:detectIntent"
      body: "*"
      additional_bindings {
        post: "/v2beta1/{session=projects/*/agent/environments/*/users/*/sessions/*}:detectIntent"
        body: "*"
      }
    };
  }

  // Processes a natural language query in audio format in a streaming fashion
  // and returns structured, actionable data as a result. This method is only
  // available via the gRPC API (not REST).
  rpc StreamingDetectIntent(stream StreamingDetectIntentRequest)
      returns (stream StreamingDetectIntentResponse) {}
}
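
// A minimal sketch of a text-based `DetectIntentRequest` in proto text
// format. The project ID, session ID, and query text below are illustrative
// placeholders, not values defined by this API.
//
//   session: "projects/my-project/agent/sessions/123456789"
//   query_input {
//     text {
//       text: "book a room tomorrow"
//       language_code: "en-US"
//     }
//   }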

// The request to detect user's intent.
message DetectIntentRequest {
  // Required. The name of the session this query is sent to. Format:
  // `projects/<Project ID>/agent/sessions/<Session ID>`, or
  // `projects/<Project ID>/agent/environments/<Environment ID>/users/<User
  // ID>/sessions/<Session ID>`. If `Environment ID` is not specified, we
  // assume the default 'draft' environment. If `User ID` is not specified, we
  // use "-". It's up to the API caller to choose an appropriate `Session ID`
  // and `User ID`. They can be random numbers or some type of user and
  // session identifiers (preferably hashed). The length of the `Session ID`
  // and `User ID` must not exceed 36 characters.
  string session = 1;

  // Optional. The parameters of this query.
  QueryParameters query_params = 2;

  // Required. The input specification. It can be set to:
  //
  // 1. an audio config which instructs the speech recognizer how to process
  //    the speech audio,
  //
  // 2. a conversational query in the form of text, or
  //
  // 3. an event that specifies which intent to trigger.
  QueryInput query_input = 3;

  // Optional. Instructs the speech synthesizer how to generate the output
  // audio. If this field is not set and agent-level speech synthesizer is not
  // configured, no output audio is generated.
  OutputAudioConfig output_audio_config = 4;

  // Optional. The natural language speech audio to be processed. This field
  // should be populated iff `query_input` is set to an input audio config.
  // A single request can contain up to 1 minute of speech audio data.
  bytes input_audio = 5;
}

// The message returned from the DetectIntent method.
message DetectIntentResponse {
  // The unique identifier of the response. It can be used to
  // locate a response in the training example set or for reporting issues.
  string response_id = 1;

  // The selected results of the conversational query or event processing.
  // See `alternative_query_results` for additional potential results.
  QueryResult query_result = 2;

  // If Knowledge Connectors are enabled, there could be more than one result
  // returned for a given query or event, and this field will contain all
  // results except for the top one, which is captured in `query_result`. The
  // alternative results are ordered by decreasing
  // `QueryResult.intent_detection_confidence`. If Knowledge Connectors are
  // disabled, this field will be empty until multiple responses for regular
  // intents are supported, at which point those additional results will be
  // surfaced here.
  repeated QueryResult alternative_query_results = 5;

  // Specifies the status of the webhook request. `webhook_status`
  // is never populated in webhook requests.
  google.rpc.Status webhook_status = 3;

  // The audio data bytes encoded as specified in the request.
  bytes output_audio = 4;

  // Instructs the speech synthesizer how to generate the output audio. This
  // field is populated from the agent-level speech synthesizer configuration,
  // if enabled.
  OutputAudioConfig output_audio_config = 6;
}
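
// An illustrative `DetectIntentResponse` in proto text format; all values
// below are made up for the sketch.
//
//   response_id: "response-id-placeholder"
//   query_result {
//     query_text: "book a room tomorrow"
//     action: "room.reservation"
//     fulfillment_text: "What time would you like the room?"
//     intent_detection_confidence: 0.8
//   }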

// Represents the parameters of the conversational query.
message QueryParameters {
  // Optional. The time zone of this conversational query from the
  // [time zone database](https://www.iana.org/time-zones), e.g.,
  // America/New_York, Europe/Paris. If not provided, the time zone specified
  // in the agent settings is used.
  string time_zone = 1;

  // Optional. The geo location of this conversational query.
  google.type.LatLng geo_location = 2;

  // Optional. The collection of contexts to be activated before this query is
  // executed.
  repeated Context contexts = 3;

  // Optional. Specifies whether to delete all contexts in the current session
  // before the new ones are activated.
  bool reset_contexts = 4;

  // Optional. The collection of session entity types to replace or extend
  // developer entities with for this query only. The entity synonyms apply
  // to all languages.
  repeated SessionEntityType session_entity_types = 5;

  // Optional. This field can be used to pass custom data into the webhook
  // associated with the agent. Arbitrary JSON objects are supported.
  google.protobuf.Struct payload = 6;

  // Optional. KnowledgeBases to get alternative results from. If not set, the
  // KnowledgeBases enabled in the agent (through UI) will be used.
  // Format: `projects/<Project ID>/knowledgeBases/<Knowledge Base ID>`.
  //
  // Note: This field is `repeated` for forward compatibility; currently only
  // the first one is supported, and we may return an error if multiple
  // KnowledgeBases are specified.
  repeated string knowledge_base_names = 12;

  // Optional. Configures the type of sentiment analysis to perform. If not
  // provided, sentiment analysis is not performed.
  // Note: Sentiment Analysis is currently available only for Enterprise
  // Edition agents.
  SentimentAnalysisRequestConfig sentiment_analysis_request_config = 10;
}

// Represents the query input. It can contain either:
//
// 1. An audio config which instructs the speech recognizer how to process
//    the speech audio.
//
// 2. A conversational query in the form of text.
//
// 3. An event that specifies which intent to trigger.
message QueryInput {
  // Required. The input specification.
  oneof input {
    // Instructs the speech recognizer how to process the speech audio.
    InputAudioConfig audio_config = 1;

    // The natural language text to be processed.
    TextInput text = 2;

    // The event to be processed.
    EventInput event = 3;
  }
}
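
// The three `QueryInput` variants, sketched in proto text format with
// illustrative values (exactly one of the `input` fields may be set):
//
//   query_input { audio_config { audio_encoding: AUDIO_ENCODING_LINEAR_16
//                                sample_rate_hertz: 16000
//                                language_code: "en-US" } }
//
//   query_input { text { text: "hello" language_code: "en-US" } }
//
//   query_input { event { name: "welcome_event" language_code: "en-US" } }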

// Represents the result of conversational query or event processing.
message QueryResult {
  // The original conversational query text:
  //
  // - If natural language text was provided as input, `query_text` contains
  //   a copy of the input.
  // - If natural language speech audio was provided as input, `query_text`
  //   contains the speech recognition result. If the speech recognizer
  //   produced multiple alternatives, a particular one is picked.
  // - If an event was provided as input, `query_text` is not set.
  string query_text = 1;

  // The language that was triggered during intent detection.
  // See [Language Support](https://dialogflow.com/docs/reference/language)
  // for a list of the currently supported language codes.
  string language_code = 15;

  // The speech recognition confidence between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. The default of 0.0 is a sentinel value indicating that confidence
  // was not set.
  //
  // This field is not guaranteed to be accurate or set. In particular this
  // field isn't set for StreamingDetectIntent since the streaming endpoint has
  // separate confidence estimates per portion of the audio in
  // StreamingRecognitionResult.
  float speech_recognition_confidence = 2;

  // The action name from the matched intent.
  string action = 3;

  // The collection of extracted parameters.
  google.protobuf.Struct parameters = 4;

  // This field is set to:
  //
  // - `false` if the matched intent has required parameters and not all of
  //   the required parameter values have been collected.
  // - `true` if all required parameter values have been collected, or if the
  //   matched intent doesn't contain any required parameters.
  bool all_required_params_present = 5;

  // The text to be pronounced to the user or shown on the screen.
  string fulfillment_text = 6;

  // The collection of rich messages to present to the user.
  repeated Intent.Message fulfillment_messages = 7;

  // If the query was fulfilled by a webhook call, this field is set to the
  // value of the `source` field returned in the webhook response.
  string webhook_source = 8;

  // If the query was fulfilled by a webhook call, this field is set to the
  // value of the `payload` field returned in the webhook response.
  google.protobuf.Struct webhook_payload = 9;

  // The collection of output contexts. If applicable,
  // `output_contexts.parameters` contains entries with name
  // `<parameter name>.original` containing the original parameter values
  // before the query.
  repeated Context output_contexts = 10;

  // The intent that matched the conversational query. Some, but not all,
  // fields are filled in this message, including but not limited to:
  // `name`, `display_name` and `webhook_state`.
  Intent intent = 11;

  // The intent detection confidence. Values range from 0.0
  // (completely uncertain) to 1.0 (completely certain).
  // If there are multiple `knowledge_answers` messages, this value is set to
  // the greatest `knowledge_answers.match_confidence` value in the list.
  float intent_detection_confidence = 12;

  // The free-form diagnostic info. For example, this field
  // could contain webhook call latency.
  google.protobuf.Struct diagnostic_info = 14;

  // The sentiment analysis result, which depends on the
  // `sentiment_analysis_request_config` specified in the request.
  SentimentAnalysisResult sentiment_analysis_result = 17;

  // The result from Knowledge Connector (if any), ordered by decreasing
  // `KnowledgeAnswers.match_confidence`.
  KnowledgeAnswers knowledge_answers = 18;
}

// Represents the result of querying a Knowledge base.
message KnowledgeAnswers {
  // An answer from Knowledge Connector.
  message Answer {
    // Represents the system's confidence that this knowledge answer is a good
    // match for this conversational query.
    enum MatchConfidenceLevel {
      // Not specified.
      MATCH_CONFIDENCE_LEVEL_UNSPECIFIED = 0;

      // Indicates that the confidence is low.
      LOW = 1;

      // Indicates that the confidence is medium.
      MEDIUM = 2;

      // Indicates that the confidence is high.
      HIGH = 3;
    }

    // Indicates which Knowledge Document this answer was extracted from.
    // Format: `projects/<Project ID>/knowledgeBases/<Knowledge Base
    // ID>/documents/<Document ID>`.
    string source = 1;

    // The corresponding FAQ question if the answer was extracted from a FAQ
    // Document, empty otherwise.
    string faq_question = 2;

    // The piece of text from the `source` knowledge base document that
    // answers this conversational query.
    string answer = 3;

    // The system's confidence level that this knowledge answer is a good
    // match for this conversational query.
    // NOTE: The confidence level for a given `<query, answer>` pair may change
    // without notice, as it depends on models that are constantly being
    // improved. However, it will change less frequently than the confidence
    // score below, and should be preferred for referencing the quality of an
    // answer.
    MatchConfidenceLevel match_confidence_level = 4;

    // The system's confidence score that this Knowledge answer is a good
    // match for this conversational query, ranging from 0.0 (completely
    // uncertain) to 1.0 (completely certain).
    // Note: The confidence score is likely to vary somewhat (possibly even
    // for identical requests), as the underlying model is under constant
    // improvement; we may deprecate it in the future. We recommend using
    // `match_confidence_level`, which should be generally more stable.
    float match_confidence = 5;
  }

  // A list of answers from Knowledge Connector.
  repeated Answer answers = 1;
}
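
// An illustrative `KnowledgeAnswers` payload in proto text format; the
// document name, question, and answer text are made-up placeholders.
//
//   answers {
//     source: "projects/my-project/knowledgeBases/kb-1/documents/doc-1"
//     faq_question: "How do I reset my password?"
//     answer: "Open Settings and choose Reset password."
//     match_confidence_level: HIGH
//     match_confidence: 0.92
//   }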

// The top-level message sent by the client to the
// `StreamingDetectIntent` method.
//
// Multiple request messages should be sent in order:
//
// 1. The first message must contain `session`, `query_input` plus optionally
//    `query_params` and/or `single_utterance`. If the client wants to receive
//    an audio response, it should also contain `output_audio_config`.
//    The message must not contain `input_audio`.
//
// 2. If `query_input` was set to a streaming input audio config,
//    all subsequent messages must contain only `input_audio`.
//    Otherwise, finish the request stream.
message StreamingDetectIntentRequest {
  // Required. The name of the session the query is sent to.
  // Format of the session name:
  // `projects/<Project ID>/agent/sessions/<Session ID>`, or
  // `projects/<Project ID>/agent/environments/<Environment ID>/users/<User
  // ID>/sessions/<Session ID>`. If `Environment ID` is not specified, we
  // assume the default 'draft' environment. If `User ID` is not specified, we
  // use "-". It's up to the API caller to choose an appropriate `Session ID`
  // and `User ID`. They can be random numbers or some type of user and
  // session identifiers (preferably hashed). The length of the `Session ID`
  // and `User ID` must not exceed 36 characters.
  string session = 1;

  // Optional. The parameters of this query.
  QueryParameters query_params = 2;

  // Required. The input specification. It can be set to:
  //
  // 1. an audio config which instructs the speech recognizer how to process
  //    the speech audio,
  //
  // 2. a conversational query in the form of text, or
  //
  // 3. an event that specifies which intent to trigger.
  QueryInput query_input = 3;

  // Optional. If `false` (default), recognition does not cease until the
  // client closes the stream.
  // If `true`, the recognizer will detect a single spoken utterance in the
  // input audio. Recognition ceases when it detects that the audio's voice
  // has stopped or paused. In this case, once a detected intent is received,
  // the client should close the stream and start a new request with a new
  // stream as needed.
  // This setting is ignored when `query_input` is a piece of text or an event.
  bool single_utterance = 4;

  // Optional. Instructs the speech synthesizer how to generate the output
  // audio. If this field is not set and agent-level speech synthesizer is not
  // configured, no output audio is generated.
  OutputAudioConfig output_audio_config = 5;

  // Optional. The input audio content to be recognized. Must be sent if
  // `query_input` was set to a streaming input audio config. The complete
  // audio over all streaming messages must not exceed 1 minute.
  bytes input_audio = 6;
}
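
// An illustrative sequence of `StreamingDetectIntentRequest` messages for an
// audio query, in proto text format. IDs and byte payloads are placeholders;
// the `<audio chunk N>` strings stand in for raw audio bytes.
//
//   # First message: configuration only, no audio.
//   session: "projects/my-project/agent/sessions/123456789"
//   query_input { audio_config { audio_encoding: AUDIO_ENCODING_LINEAR_16
//                                sample_rate_hertz: 16000
//                                language_code: "en-US" } }
//   single_utterance: true
//
//   # Subsequent messages: audio chunks only.
//   input_audio: "<audio chunk 1>"
//   input_audio: "<audio chunk 2>"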

// The top-level message returned from the
// `StreamingDetectIntent` method.
//
// Multiple response messages can be returned in order:
//
// 1. If the input was set to streaming audio, the first one or more messages
//    contain `recognition_result`. Each `recognition_result` represents a
//    more complete transcript of what the user said. The last
//    `recognition_result` has `is_final` set to `true`.
//
// 2. The next message contains `response_id`, `query_result`,
//    `alternative_query_results` and optionally `webhook_status` if a WebHook
//    was called.
//
// 3. If `output_audio_config` was specified in the request or agent-level
//    speech synthesizer is configured, all subsequent messages contain
//    `output_audio` and `output_audio_config`.
message StreamingDetectIntentResponse {
  // The unique identifier of the response. It can be used to
  // locate a response in the training example set or for reporting issues.
  string response_id = 1;

  // The result of speech recognition.
  StreamingRecognitionResult recognition_result = 2;

  // The selected results of the conversational query or event processing.
  // See `alternative_query_results` for additional potential results.
  QueryResult query_result = 3;

  // If Knowledge Connectors are enabled, there could be more than one result
  // returned for a given query or event, and this field will contain all
  // results except for the top one, which is captured in `query_result`. The
  // alternative results are ordered by decreasing
  // `QueryResult.intent_detection_confidence`. If Knowledge Connectors are
  // disabled, this field will be empty until multiple responses for regular
  // intents are supported, at which point those additional results will be
  // surfaced here.
  repeated QueryResult alternative_query_results = 7;

  // Specifies the status of the webhook request.
  google.rpc.Status webhook_status = 4;

  // The audio data bytes encoded as specified in the request.
  bytes output_audio = 5;

  // Instructs the speech synthesizer how to generate the output audio. This
  // field is populated from the agent-level speech synthesizer configuration,
  // if enabled.
  OutputAudioConfig output_audio_config = 6;
}

// Contains a speech recognition result corresponding to a portion of the audio
// that is currently being processed or an indication that this is the end
// of the single requested utterance.
//
// Example:
//
// 1. transcript: "tube"
//
// 2. transcript: "to be a"
//
// 3. transcript: "to be"
//
// 4. transcript: "to be or not to be"
//    is_final: true
//
// 5. transcript: " that's"
//
// 6. transcript: " that is"
//
// 7. message_type: `END_OF_SINGLE_UTTERANCE`
//
// 8. transcript: " that is the question"
//    is_final: true
//
// Only two of the responses contain final results (#4 and #8 indicated by
// `is_final: true`). Concatenating these generates the full transcript: "to be
// or not to be that is the question".
//
// In each response we populate:
//
// * for `TRANSCRIPT`: `transcript` and possibly `is_final`.
//
// * for `END_OF_SINGLE_UTTERANCE`: only `message_type`.
message StreamingRecognitionResult {
  // Type of the response message.
  enum MessageType {
    // Not specified. Should never be used.
    MESSAGE_TYPE_UNSPECIFIED = 0;

    // Message contains a (possibly partial) transcript.
    TRANSCRIPT = 1;

    // Event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the
    // server will not process additional audio (although it may subsequently
    // return additional results). The client should stop sending additional
    // audio data, half-close the gRPC connection, and wait for any additional
    // results until the server closes the gRPC connection. This message is
    // only sent if `single_utterance` was set to `true`, and is not used
    // otherwise.
    END_OF_SINGLE_UTTERANCE = 2;
  }

  // Type of the result message.
  MessageType message_type = 1;

  // Transcript text representing the words that the user spoke.
  // Populated if and only if `message_type` = `TRANSCRIPT`.
  string transcript = 2;

  // If `false`, the `StreamingRecognitionResult` represents an
  // interim result that may change. If `true`, the recognizer will not return
  // any further hypotheses about this piece of the audio. May only be
  // populated for `message_type` = `TRANSCRIPT`.
  bool is_final = 3;

  // The speech confidence between 0.0 and 1.0 for the current portion of
  // audio. A higher number indicates an estimated greater likelihood that the
  // recognized words are correct. The default of 0.0 is a sentinel value
  // indicating that confidence was not set.
  //
  // This field is typically only provided if `is_final` is true and you
  // should not rely on it being accurate or even set.
  float confidence = 4;
}

// Instructs the speech recognizer how to process the audio content.
message InputAudioConfig {
  // Required. Audio encoding of the audio content to process.
  AudioEncoding audio_encoding = 1;

  // Required. Sample rate (in Hertz) of the audio content sent in the query.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics) for
  // more details.
  int32 sample_rate_hertz = 2;

  // Required. The language of the supplied audio. Dialogflow does not do
  // translations. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 3;

  // Optional. The collection of phrase hints which are used to boost accuracy
  // of speech recognition.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#phrase-hints)
  // for more details.
  repeated string phrase_hints = 4;

  // Optional. Which Speech model to select for the given request. Select the
  // model best suited to your domain to get best results. If a model is not
  // explicitly specified, then we auto-select a model based on the parameters
  // in the InputAudioConfig.
  // If an enhanced speech model is enabled for the agent and an enhanced
  // version of the specified model for the language does not exist, then the
  // speech is recognized using the standard version of the specified model.
  // Refer to
  // [Cloud Speech API
  // documentation](https://cloud.google.com/speech-to-text/docs/basics#select-model)
  // for more details.
  string model = 7;
}
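
// An illustrative `InputAudioConfig` in proto text format; the phrase hints
// are placeholders chosen for the sketch.
//
//   audio_encoding: AUDIO_ENCODING_FLAC
//   sample_rate_hertz: 16000
//   language_code: "en-US"
//   phrase_hints: "book a room"
//   phrase_hints: "conference room"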

// Represents the natural language text to be processed.
message TextInput {
  // Required. The UTF-8 encoded natural language text to be processed.
  // Text length must not exceed 256 bytes.
  string text = 1;

  // Required. The language of this conversational query. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 2;
}

// Events allow for matching intents by event name instead of the natural
// language input. For instance, input
// `<event: { name: "welcome_event", parameters: { name: "Sam" } }>` can
// trigger a personalized welcome response.
// The parameter `name` may be used by the agent in the response:
// `"Hello #welcome_event.name! What can I do for you today?"`.
message EventInput {
  // Required. The unique identifier of the event.
  string name = 1;

  // Optional. The collection of parameters associated with the event.
  google.protobuf.Struct parameters = 2;

  // Required. The language of this query. See [Language
  // Support](https://dialogflow.com/docs/languages) for a list of the
  // currently supported language codes. Note that queries in the same session
  // do not necessarily need to specify the same language.
  string language_code = 3;
}

// Configures the types of sentiment analysis to perform.
message SentimentAnalysisRequestConfig {
  // Optional. Instructs the service to perform sentiment analysis on
  // `query_text`. If not provided, sentiment analysis is not performed on
  // `query_text`.
  bool analyze_query_text_sentiment = 1;
}

// The result of sentiment analysis as configured by
// `sentiment_analysis_request_config`.
message SentimentAnalysisResult {
  // The sentiment analysis result for `query_text`.
  Sentiment query_text_sentiment = 1;
}

// The sentiment, such as positive/negative feeling or association, for a unit
// of analysis, such as the query text.
message Sentiment {
  // Sentiment score between -1.0 (negative sentiment) and 1.0 (positive
  // sentiment).
  float score = 1;

  // A non-negative number in the [0, +inf) range, which represents the
  // absolute magnitude of sentiment, regardless of score (positive or
  // negative).
  float magnitude = 2;
}
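
// An illustrative `Sentiment` result in proto text format; the values are
// made up. A negative `score` with a sizable `magnitude` would indicate
// clearly negative wording in `query_text`.
//
//   sentiment_analysis_result {
//     query_text_sentiment { score: -0.6 magnitude: 0.8 }
//   }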

// Audio encoding of the audio content sent in the conversational query
// request. Refer to the
// [Cloud Speech API
// documentation](https://cloud.google.com/speech-to-text/docs/basics) for more
// details.
enum AudioEncoding {
  // Not specified.
  AUDIO_ENCODING_UNSPECIFIED = 0;

  // Uncompressed 16-bit signed little-endian samples (Linear PCM).
  AUDIO_ENCODING_LINEAR_16 = 1;

  // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
  // Codec) is the recommended encoding because it is lossless (therefore
  // recognition is not compromised) and requires only about half the
  // bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and
  // 24-bit samples; however, not all fields in `STREAMINFO` are supported.
  AUDIO_ENCODING_FLAC = 2;

  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
  AUDIO_ENCODING_MULAW = 3;

  // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
  AUDIO_ENCODING_AMR = 4;

  // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_AMR_WB = 5;

  // Opus encoded audio frames in Ogg container
  // ([OggOpus](https://wiki.xiph.org/OggOpus)).
  // `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_OGG_OPUS = 6;

  // Although the use of lossy encodings is not recommended, if a very low
  // bitrate encoding is required, `OGG_OPUS` is highly preferred over
  // Speex encoding. The [Speex](https://speex.org/) encoding supported by
  // Dialogflow API has a header byte in each block, as in MIME type
  // `audio/x-speex-with-header-byte`.
  // It is a variant of the RTP Speex encoding defined in
  // [RFC 5574](https://tools.ietf.org/html/rfc5574).
  // The stream is a sequence of blocks, one block per RTP packet. Each block
  // starts with a byte containing the length of the block, in bytes, followed
  // by one or more frames of Speex data, padded to an integral number of
  // bytes (octets) as specified in RFC 5574. In other words, each RTP header
  // is replaced with a single byte containing the block length. Only Speex
  // wideband is supported. `sample_rate_hertz` must be 16000.
  AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
}