// Copyright 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.dataproc.v1;

import "google/api/annotations.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/field_mask.proto";
import "google/protobuf/timestamp.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/dataproc/v1;dataproc";
option java_multiple_files = true;
option java_outer_classname = "JobsProto";
option java_package = "com.google.cloud.dataproc.v1";

// The JobController provides methods to manage jobs.
service JobController {
  // Submits a job to a cluster.
  rpc SubmitJob(SubmitJobRequest) returns (Job) {
    option (google.api.http) = {
      post: "/v1/projects/{project_id}/regions/{region}/jobs:submit"
      body: "*"
    };
  }

  // Gets the resource representation for a job in a project.
  rpc GetJob(GetJobRequest) returns (Job) {
    option (google.api.http) = {
      get: "/v1/projects/{project_id}/regions/{region}/jobs/{job_id}"
    };
  }

  // Lists jobs in a project.
  rpc ListJobs(ListJobsRequest) returns (ListJobsResponse) {
    option (google.api.http) = {
      get: "/v1/projects/{project_id}/regions/{region}/jobs"
    };
  }

  // Updates a job in a project.
  rpc UpdateJob(UpdateJobRequest) returns (Job) {
    option (google.api.http) = {
      patch: "/v1/projects/{project_id}/regions/{region}/jobs/{job_id}"
      body: "job"
    };
  }

  // Starts a job cancellation request. To access the job resource
  // after cancellation, call
  // [regions/{region}/jobs.list](/dataproc/docs/reference/rest/v1/projects.regions.jobs/list)
  // or
  // [regions/{region}/jobs.get](/dataproc/docs/reference/rest/v1/projects.regions.jobs/get).
  rpc CancelJob(CancelJobRequest) returns (Job) {
    option (google.api.http) = {
      post: "/v1/projects/{project_id}/regions/{region}/jobs/{job_id}:cancel"
      body: "*"
    };
  }

  // Deletes the job from the project. If the job is active, the delete fails,
  // and the response returns `FAILED_PRECONDITION`.
  rpc DeleteJob(DeleteJobRequest) returns (google.protobuf.Empty) {
    option (google.api.http) = {
      delete: "/v1/projects/{project_id}/regions/{region}/jobs/{job_id}"
    };
  }
}

// The runtime logging config of the job.
message LoggingConfig {
  // The Log4j level for job execution. When running an
  // [Apache Hive](http://hive.apache.org/) job, Cloud
  // Dataproc configures the Hive client to an equivalent verbosity level.
  enum Level {
    // Level is unspecified. Use default level for log4j.
    LEVEL_UNSPECIFIED = 0;

    // Use ALL level for log4j.
    ALL = 1;

    // Use TRACE level for log4j.
    TRACE = 2;

    // Use DEBUG level for log4j.
    DEBUG = 3;

    // Use INFO level for log4j.
    INFO = 4;

    // Use WARN level for log4j.
    WARN = 5;

    // Use ERROR level for log4j.
    ERROR = 6;

    // Use FATAL level for log4j.
    FATAL = 7;

    // Turn off log4j.
    OFF = 8;
  }

  // The per-package log levels for the driver. This may include
  // "root" package name to configure rootLogger.
  // Examples:
  //   'com.google = FATAL', 'root = INFO', 'org.apache = DEBUG'
  map<string, Level> driver_log_levels = 2;
}

// A Cloud Dataproc job for running
// [Apache Hadoop
// MapReduce](https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html)
// jobs on [Apache Hadoop
// YARN](https://hadoop.apache.org/docs/r2.7.1/hadoop-yarn/hadoop-yarn-site/YARN.html).
message HadoopJob {
  // Required. Indicates the location of the driver's main class. Specify
  // either the jar file that contains the main class or the main class name.
  // To specify both, add the jar file to `jar_file_uris`, and then specify
  // the main class name in this property.
  oneof driver {
    // The HCFS URI of the jar file containing the main class.
    // Examples:
    //     'gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar'
    //     'hdfs:/tmp/test-samples/custom-wordcount.jar'
    //     'file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'
    string main_jar_file_uri = 1;

    // The name of the driver's main class. The jar file containing the class
    // must be in the default CLASSPATH or specified in `jar_file_uris`.
    string main_class = 2;
  }

  // Optional. The arguments to pass to the driver. Do not
  // include arguments, such as `-libjars` or `-Dfoo=bar`, that can be set as
  // job properties, since a collision may occur that causes an incorrect job
  // submission.
  repeated string args = 3;

  // Optional. Jar file URIs to add to the CLASSPATHs of the
  // Hadoop driver and tasks.
  repeated string jar_file_uris = 4;

  // Optional. HCFS (Hadoop Compatible Filesystem) URIs of files to be copied
  // to the working directory of Hadoop drivers and distributed tasks. Useful
  // for naively parallel tasks.
  repeated string file_uris = 5;

  // Optional. HCFS URIs of archives to be extracted in the working directory of
  // Hadoop drivers and tasks. Supported file types:
  // .jar, .tar, .tar.gz, .tgz, or .zip.
  repeated string archive_uris = 6;

  // Optional. A mapping of property names to values, used to configure Hadoop.
  // Properties that conflict with values set by the Cloud Dataproc API may be
  // overwritten. Can include properties set in /etc/hadoop/conf/*-site and
  // classes in user code.
  map<string, string> properties = 7;

  // Optional. The runtime log config for job execution.
  LoggingConfig logging_config = 8;
}

// A Cloud Dataproc job for running [Apache Spark](http://spark.apache.org/)
// applications on YARN.
message SparkJob {
  // Required. The specification of the main method to call to drive the job.
  // Specify either the jar file that contains the main class or the main class
  // name. To pass both a main jar and a main class in that jar, add the jar to
  // `jar_file_uris`, and then specify the main class name in
  // `main_class`.
  oneof driver {
    // The HCFS URI of the jar file that contains the main class.
    string main_jar_file_uri = 1;

    // The name of the driver's main class. The jar file that contains the class
    // must be in the default CLASSPATH or specified in `jar_file_uris`.
    string main_class = 2;
  }

  // Optional. The arguments to pass to the driver. Do not include arguments,
  // such as `--conf`, that can be set as job properties, since a collision may
  // occur that causes an incorrect job submission.
  repeated string args = 3;

  // Optional. HCFS URIs of jar files to add to the CLASSPATHs of the
  // Spark driver and tasks.
  repeated string jar_file_uris = 4;

  // Optional. HCFS URIs of files to be copied to the working directory of
  // Spark drivers and distributed tasks. Useful for naively parallel tasks.
  repeated string file_uris = 5;

  // Optional. HCFS URIs of archives to be extracted in the working directory
  // of Spark drivers and tasks. Supported file types:
  // .jar, .tar, .tar.gz, .tgz, and .zip.
  repeated string archive_uris = 6;

  // Optional. A mapping of property names to values, used to configure Spark.
  // Properties that conflict with values set by the Cloud Dataproc API may be
  // overwritten. Can include properties set in
  // /etc/spark/conf/spark-defaults.conf and classes in user code.
  map<string, string> properties = 7;

  // Optional. The runtime log config for job execution.
  LoggingConfig logging_config = 8;
}

// A Cloud Dataproc job for running
// [Apache
// PySpark](https://spark.apache.org/docs/0.9.0/python-programming-guide.html)
// applications on YARN.
message PySparkJob {
  // Required. The HCFS URI of the main Python file to use as the driver. Must
  // be a .py file.
  string main_python_file_uri = 1;

  // Optional. The arguments to pass to the driver. Do not include arguments,
  // such as `--conf`, that can be set as job properties, since a collision may
  // occur that causes an incorrect job submission.
  repeated string args = 2;

  // Optional. HCFS file URIs of Python files to pass to the PySpark
  // framework. Supported file types: .py, .egg, and .zip.
  repeated string python_file_uris = 3;

  // Optional. HCFS URIs of jar files to add to the CLASSPATHs of the
  // Python driver and tasks.
  repeated string jar_file_uris = 4;

  // Optional. HCFS URIs of files to be copied to the working directory of
  // Python drivers and distributed tasks. Useful for naively parallel tasks.
  repeated string file_uris = 5;

  // Optional. HCFS URIs of archives to be extracted in the working directory
  // of Python drivers and tasks. Supported file types:
  // .jar, .tar, .tar.gz, .tgz, and .zip.
  repeated string archive_uris = 6;

  // Optional. A mapping of property names to values, used to configure PySpark.
  // Properties that conflict with values set by the Cloud Dataproc API may be
  // overwritten. Can include properties set in
  // /etc/spark/conf/spark-defaults.conf and classes in user code.
  map<string, string> properties = 7;

  // Optional. The runtime log config for job execution.
  LoggingConfig logging_config = 8;
}

// A list of queries to run on a cluster.
message QueryList {
  // Required. The queries to execute. You do not need to terminate a query
  // with a semicolon. Multiple queries can be specified in one string
  // by separating each with a semicolon. Here is an example of a Cloud
  // Dataproc API snippet that uses a QueryList to specify a HiveJob:
  //
  //     "hiveJob": {
  //       "queryList": {
  //         "queries": [
  //           "query1",
  //           "query2",
  //           "query3;query4",
  //         ]
  //       }
  //     }
  repeated string queries = 1;
}

// A Cloud Dataproc job for running [Apache Hive](https://hive.apache.org/)
// queries on YARN.
message HiveJob {
  // Required. The sequence of Hive queries to execute, specified as either
  // an HCFS file URI or a list of queries.
  oneof queries {
    // The HCFS URI of the script that contains Hive queries.
    string query_file_uri = 1;

    // A list of queries.
    QueryList query_list = 2;
  }

  // Optional. Whether to continue executing queries if a query fails.
  // The default value is `false`. Setting to `true` can be useful when
  // executing independent parallel queries.
  bool continue_on_failure = 3;

  // Optional. Mapping of query variable names to values (equivalent to the
  // Hive command: `SET name="value";`).
  map<string, string> script_variables = 4;

  // Optional. A mapping of property names and values, used to configure Hive.
  // Properties that conflict with values set by the Cloud Dataproc API may be
  // overwritten. Can include properties set in /etc/hadoop/conf/*-site.xml,
  // /etc/hive/conf/hive-site.xml, and classes in user code.
  map<string, string> properties = 5;

  // Optional. HCFS URIs of jar files to add to the CLASSPATH of the
  // Hive server and Hadoop MapReduce (MR) tasks. Can contain Hive SerDes
  // and UDFs.
  repeated string jar_file_uris = 6;
}

// A Cloud Dataproc job for running [Apache Spark
// SQL](http://spark.apache.org/sql/) queries.
message SparkSqlJob {
  // Required. The sequence of Spark SQL queries to execute, specified as
  // either an HCFS file URI or as a list of queries.
  oneof queries {
    // The HCFS URI of the script that contains SQL queries.
    string query_file_uri = 1;

    // A list of queries.
    QueryList query_list = 2;
  }

  // Optional. Mapping of query variable names to values (equivalent to the
  // Spark SQL command: `SET name="value";`).
  map<string, string> script_variables = 3;

  // Optional. A mapping of property names to values, used to configure
  // Spark SQL's SparkConf. Properties that conflict with values set by the
  // Cloud Dataproc API may be overwritten.
  map<string, string> properties = 4;

  // Optional. HCFS URIs of jar files to be added to the Spark CLASSPATH.
  repeated string jar_file_uris = 56;

  // Optional. The runtime log config for job execution.
  LoggingConfig logging_config = 6;
}

// A Cloud Dataproc job for running [Apache Pig](https://pig.apache.org/)
// queries on YARN.
message PigJob {
  // Required. The sequence of Pig queries to execute, specified as an HCFS
  // file URI or a list of queries.
  oneof queries {
    // The HCFS URI of the script that contains the Pig queries.
    string query_file_uri = 1;

    // A list of queries.
    QueryList query_list = 2;
  }

  // Optional. Whether to continue executing queries if a query fails.
  // The default value is `false`. Setting to `true` can be useful when
  // executing independent parallel queries.
  bool continue_on_failure = 3;

  // Optional. Mapping of query variable names to values (equivalent to the Pig
  // command: `name=[value]`).
  map<string, string> script_variables = 4;

  // Optional. A mapping of property names to values, used to configure Pig.
  // Properties that conflict with values set by the Cloud Dataproc API may be
  // overwritten. Can include properties set in /etc/hadoop/conf/*-site.xml,
  // /etc/pig/conf/pig.properties, and classes in user code.
  map<string, string> properties = 5;

  // Optional. HCFS URIs of jar files to add to the CLASSPATH of
  // the Pig Client and Hadoop MapReduce (MR) tasks. Can contain Pig UDFs.
  repeated string jar_file_uris = 6;

  // Optional. The runtime log config for job execution.
  LoggingConfig logging_config = 7;
}

// Cloud Dataproc job config.
message JobPlacement {
  // Required. The name of the cluster where the job will be submitted.
  string cluster_name = 1;

  // Output only. A cluster UUID generated by the Cloud Dataproc service when
  // the job is submitted.
  string cluster_uuid = 2;
}

// Cloud Dataproc job status.
message JobStatus {
  // The job state.
  enum State {
    // The job state is unknown.
    STATE_UNSPECIFIED = 0;

    // The job is pending; it has been submitted, but is not yet running.
    PENDING = 1;

    // Job has been received by the service and completed initial setup;
    // it will soon be submitted to the cluster.
    SETUP_DONE = 8;

    // The job is running on the cluster.
    RUNNING = 2;

    // A CancelJob request has been received, but is pending.
    CANCEL_PENDING = 3;

    // Transient in-flight resources have been canceled, and the request to
    // cancel the running job has been issued to the cluster.
    CANCEL_STARTED = 7;

    // The job cancellation was successful.
    CANCELLED = 4;

    // The job has completed successfully.
    DONE = 5;

    // The job has completed, but encountered an error.
    ERROR = 6;

    // Job attempt has failed. The detail field contains failure details for
    // this attempt.
    //
    // Applies to restartable jobs only.
    ATTEMPT_FAILURE = 9;
  }

  // The job substate.
  enum Substate {
    // The job substate is unknown.
    UNSPECIFIED = 0;

    // The Job is submitted to the agent.
    //
    // Applies to RUNNING state.
    SUBMITTED = 1;

    // The Job has been received and is awaiting execution (it may be waiting
    // for a condition to be met). See the "details" field for the reason for
    // the delay.
    //
    // Applies to RUNNING state.
    QUEUED = 2;

    // The agent-reported status is out of date, which may be caused by a
    // loss of communication between the agent and Cloud Dataproc. If the
    // agent does not send a timely update, the job will fail.
    //
    // Applies to RUNNING state.
    STALE_STATUS = 3;
  }

  // Output only. A state message specifying the overall job state.
  State state = 1;

  // Output only. Optional job state details, such as an error
  // description if the state is ERROR.
  string details = 2;

  // Output only. The time when this state was entered.
  google.protobuf.Timestamp state_start_time = 6;

  // Output only. Additional state information, which includes
  // status reported by the agent.
  Substate substate = 7;
}

// Encapsulates the full scoping used to reference a job.
message JobReference {
  // Required. The ID of the Google Cloud Platform project that the job
  // belongs to.
  string project_id = 1;

  // Optional. The job ID, which must be unique within the project. The job ID
  // is generated by the server upon job submission or provided by the user as a
  // means to perform retries without creating duplicate jobs. The ID must
  // contain only letters (a-z, A-Z), numbers (0-9), underscores (_), or
  // hyphens (-). The maximum length is 100 characters.
  string job_id = 2;
}

// A YARN application created by a job. Application information is a subset of
// org.apache.hadoop.yarn.proto.YarnProtos.ApplicationReportProto.
//
// **Beta Feature**: This report is available for testing purposes only. It may
// be changed before final release.
message YarnApplication {
  // The application state, corresponding to
  // YarnProtos.YarnApplicationStateProto.
  enum State {
    // Status is unspecified.
    STATE_UNSPECIFIED = 0;

    // Status is NEW.
    NEW = 1;

    // Status is NEW_SAVING.
    NEW_SAVING = 2;

    // Status is SUBMITTED.
    SUBMITTED = 3;

    // Status is ACCEPTED.
    ACCEPTED = 4;

    // Status is RUNNING.
    RUNNING = 5;

    // Status is FINISHED.
    FINISHED = 6;

    // Status is FAILED.
    FAILED = 7;

    // Status is KILLED.
    KILLED = 8;
  }

  // Required. The application name.
  string name = 1;

  // Required. The application state.
  State state = 2;

  // Required. The numerical progress of the application, from 1 to 100.
  float progress = 3;

  // Optional. The HTTP URL of the ApplicationMaster, HistoryServer, or
  // TimelineServer that provides application-specific information. The URL uses
  // the internal hostname, and requires a proxy server for resolution and,
  // possibly, access.
  string tracking_url = 4;
}

// A Cloud Dataproc job resource.
message Job {
  // Optional. The fully qualified reference to the job, which can be used to
  // obtain the equivalent REST path of the job resource. If this property
  // is not specified when a job is created, the server generates a
  // job_id.
  JobReference reference = 1;

  // Required. Job information, including how, when, and where to
  // run the job.
  JobPlacement placement = 2;

  // Required. The application/framework-specific portion of the job.
  oneof type_job {
    // Job is a Hadoop job.
    HadoopJob hadoop_job = 3;

    // Job is a Spark job.
    SparkJob spark_job = 4;

    // Job is a Pyspark job.
    PySparkJob pyspark_job = 5;

    // Job is a Hive job.
    HiveJob hive_job = 6;

    // Job is a Pig job.
    PigJob pig_job = 7;

    // Job is a SparkSql job.
    SparkSqlJob spark_sql_job = 12;
  }

  // Output only. The job status. Additional application-specific
  // status information may be contained in the type_job
  // and yarn_applications fields.
  JobStatus status = 8;

  // Output only. The previous job status.
  repeated JobStatus status_history = 13;

  // Output only. The collection of YARN applications spun up by this job.
  //
  // **Beta** Feature: This report is available for testing purposes only. It
  // may be changed before final release.
  repeated YarnApplication yarn_applications = 9;

  // Output only. A URI pointing to the location of the stdout of the job's
  // driver program.
  string driver_output_resource_uri = 17;

  // Output only. If present, the location of miscellaneous control files
  // which may be used as part of job setup and handling. If not present,
  // control files may be placed in the same location as
  // `driver_output_resource_uri`.
  string driver_control_files_uri = 15;

  // Optional. The labels to associate with this job.
  // Label **keys** must contain 1 to 63 characters, and must conform to
  // [RFC 1035](https://www.ietf.org/rfc/rfc1035.txt).
  // Label **values** may be empty, but, if present, must contain 1 to 63
  // characters, and must conform to [RFC
  // 1035](https://www.ietf.org/rfc/rfc1035.txt). No more than 32 labels can be
  // associated with a job.
  map<string, string> labels = 18;

  // Optional. Job scheduling configuration.
  JobScheduling scheduling = 20;

  // Output only. A UUID that uniquely identifies a job within the project
  // over time. This is in contrast to a user-settable reference.job_id that
  // may be reused over time.
  string job_uuid = 22;
}

// Job scheduling options.
message JobScheduling {
  // Optional. Maximum number of times per hour a driver may be restarted as
  // a result of driver terminating with non-zero code before job is
  // reported failed.
  //
  // A job may be reported as thrashing if driver exits with non-zero code
  // 4 times within 10 minute window.
  //
  // Maximum value is 10.
  int32 max_failures_per_hour = 1;
}

// A request to submit a job.
message SubmitJobRequest {
  // Required. The ID of the Google Cloud Platform project that the job
  // belongs to.
  string project_id = 1;

  // Required. The Cloud Dataproc region in which to handle the request.
  string region = 3;

  // Required. The job resource.
  Job job = 2;

  // Optional. A unique id used to identify the request. If the server
  // receives two [SubmitJobRequest][google.cloud.dataproc.v1.SubmitJobRequest]
  // requests with the same id, then the second request will be ignored and the
  // first [Job][google.cloud.dataproc.v1.Job] created and stored in the backend
  // is returned.
  //
  // It is recommended to always set this value to a
  // [UUID](https://en.wikipedia.org/wiki/Universally_unique_identifier).
  //
  // The id must contain only letters (a-z, A-Z), numbers (0-9),
  // underscores (_), and hyphens (-). The maximum length is 40 characters.
  string request_id = 4;
}

// A request to get the resource representation for a job in a project.
message GetJobRequest {
  // Required. The ID of the Google Cloud Platform project that the job
  // belongs to.
  string project_id = 1;

  // Required. The Cloud Dataproc region in which to handle the request.
  string region = 3;

  // Required. The job ID.
  string job_id = 2;
}

// A request to list jobs in a project.
message ListJobsRequest {
  // A matcher that specifies categories of job states.
  enum JobStateMatcher {
    // Match all jobs, regardless of state.
    ALL = 0;

    // Only match jobs in non-terminal states: PENDING, RUNNING, or
    // CANCEL_PENDING.
    ACTIVE = 1;

    // Only match jobs in terminal states: CANCELLED, DONE, or ERROR.
    NON_ACTIVE = 2;
  }

  // Required. The ID of the Google Cloud Platform project that the job
  // belongs to.
  string project_id = 1;

  // Required. The Cloud Dataproc region in which to handle the request.
  string region = 6;

  // Optional. The number of results to return in each response.
  int32 page_size = 2;

  // Optional. The page token, returned by a previous call, to request the
  // next page of results.
  string page_token = 3;

  // Optional. If set, the returned jobs list includes only jobs that were
  // submitted to the named cluster.
  string cluster_name = 4;

  // Optional. Specifies enumerated categories of jobs to list.
  // (default = match ALL jobs).
  //
  // If `filter` is provided, `jobStateMatcher` will be ignored.
  JobStateMatcher job_state_matcher = 5;

  // Optional. A filter constraining the jobs to list. Filters are
  // case-sensitive and have the following syntax:
  //
  // [field = value] AND [field [= value]] ...
  //
  // where **field** is `status.state` or `labels.[KEY]`, and `[KEY]` is a label
  // key. **value** can be `*` to match all values.
  // `status.state` can be either `ACTIVE` or `NON_ACTIVE`.
  // Only the logical `AND` operator is supported; space-separated items are
  // treated as having an implicit `AND` operator.
  //
  // Example filter:
  //
  // status.state = ACTIVE AND labels.env = staging AND labels.starred = *
  string filter = 7;
}

// A request to update a job.
message UpdateJobRequest {
  // Required. The ID of the Google Cloud Platform project that the job
  // belongs to.
  string project_id = 1;

  // Required. The Cloud Dataproc region in which to handle the request.
  string region = 2;

  // Required. The job ID.
  string job_id = 3;

  // Required. The changes to the job.
  Job job = 4;

  // Required. Specifies the path, relative to Job, of
  // the field to update. For example, to update the labels of a Job the
  // `update_mask` parameter would be specified as
  // `labels`, and the `PATCH` request body would specify the new
  // value. Note: Currently, `labels` is the only
  // field that can be updated.
  google.protobuf.FieldMask update_mask = 5;
}

// A list of jobs in a project.
message ListJobsResponse {
  // Output only. Jobs list.
  repeated Job jobs = 1;

  // Optional. This token is included in the response if there are more results
  // to fetch. To fetch additional results, provide this value as the
  // `page_token` in a subsequent ListJobsRequest.
  string next_page_token = 2;
}

// A request to cancel a job.
message CancelJobRequest {
  // Required. The ID of the Google Cloud Platform project that the job
  // belongs to.
  string project_id = 1;

  // Required. The Cloud Dataproc region in which to handle the request.
  string region = 3;

  // Required. The job ID.
  string job_id = 2;
}

// A request to delete a job.
message DeleteJobRequest {
  // Required. The ID of the Google Cloud Platform project that the job
  // belongs to.
  string project_id = 1;

  // Required. The Cloud Dataproc region in which to handle the request.
  string region = 3;

  // Required. The job ID.
  string job_id = 2;
}