// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.bigquery.v2;

import "google/api/annotations.proto";
import "google/cloud/bigquery/v2/model_reference.proto";
import "google/cloud/bigquery/v2/standard_sql.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/protobuf/wrappers.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery";
option java_outer_classname = "ModelProto";
option java_package = "com.google.cloud.bigquery.v2";

// Service exposing get, list, patch and delete operations on models.
service ModelService {
  // Gets the specified model resource by model ID.
  rpc GetModel(GetModelRequest) returns (Model) {
  }

  // Lists all models in the specified dataset. Requires the READER dataset
  // role.
  rpc ListModels(ListModelsRequest) returns (ListModelsResponse) {
  }

  // Patch specific fields in the specified model.
  rpc PatchModel(PatchModelRequest) returns (Model) {
  }

  // Deletes the model specified by modelId from the dataset.
  rpc DeleteModel(DeleteModelRequest) returns (google.protobuf.Empty) {
  }
}

// A model resource, as returned by ModelService.GetModel and
// ModelService.PatchModel and listed by ModelService.ListModels.
message Model {
  // Evaluation metrics for regression models.
  message RegressionMetrics {
    // Mean absolute error.
    google.protobuf.DoubleValue mean_absolute_error = 1;

    // Mean squared error.
    google.protobuf.DoubleValue mean_squared_error = 2;

    // Mean squared log error.
    google.protobuf.DoubleValue mean_squared_log_error = 3;

    // Median absolute error.
    google.protobuf.DoubleValue median_absolute_error = 4;

    // R^2 score.
    google.protobuf.DoubleValue r_squared = 5;
  }

  // Aggregate metrics for classification models. For multi-class models,
  // the metrics are either macro-averaged: metrics are calculated for each
  // label and then an unweighted average is taken of those values or
  // micro-averaged: the metric is calculated globally by counting the total
  // number of correctly predicted rows.
  message AggregateClassificationMetrics {
    // Precision is the fraction of actual positive predictions that had
    // positive actual labels. For multiclass this is a macro-averaged
    // metric treating each class as a binary classifier.
    google.protobuf.DoubleValue precision = 1;

    // Recall is the fraction of actual positive labels that were given a
    // positive prediction. For multiclass this is a macro-averaged metric.
    google.protobuf.DoubleValue recall = 2;

    // Accuracy is the fraction of predictions given the correct label. For
    // multiclass this is a micro-averaged metric.
    google.protobuf.DoubleValue accuracy = 3;

    // Threshold at which the metrics are computed. For binary
    // classification models this is the positive class threshold.
    // For multi-class classification models this is the confidence
    // threshold.
    google.protobuf.DoubleValue threshold = 4;

    // The F1 score is the harmonic mean of recall and precision. For
    // multiclass this is a macro-averaged metric.
    google.protobuf.DoubleValue f1_score = 5;

    // Logarithmic Loss. For multiclass this is a macro-averaged metric.
    google.protobuf.DoubleValue log_loss = 6;

    // Area Under a ROC Curve. For multiclass this is a macro-averaged
    // metric.
    google.protobuf.DoubleValue roc_auc = 7;
  }

  // Evaluation metrics for binary classification models.
  message BinaryClassificationMetrics {
    // Confusion matrix for binary classification models.
    message BinaryConfusionMatrix {
      // Threshold value used when computing each of the following metrics.
      google.protobuf.DoubleValue positive_class_threshold = 1;

      // Number of true samples predicted as true.
      google.protobuf.Int64Value true_positives = 2;

      // Number of false samples predicted as true.
      google.protobuf.Int64Value false_positives = 3;

      // Number of false samples predicted as false.
      // NOTE(review): description follows the standard definition implied by
      // the field name; confirm it matches what the service populates.
      google.protobuf.Int64Value true_negatives = 4;

      // Number of true samples predicted as false.
      // NOTE(review): description follows the standard definition implied by
      // the field name; confirm it matches what the service populates.
      google.protobuf.Int64Value false_negatives = 5;

      // Aggregate precision.
      google.protobuf.DoubleValue precision = 6;

      // Aggregate recall.
      google.protobuf.DoubleValue recall = 7;
    }

    // Aggregate classification metrics.
    AggregateClassificationMetrics aggregate_classification_metrics = 1;

    // Binary confusion matrix at multiple thresholds.
    repeated BinaryConfusionMatrix binary_confusion_matrix_list = 2;
  }

  // Evaluation metrics for multi-class classification models.
  message MultiClassClassificationMetrics {
    // Confusion matrix for multi-class classification models.
    message ConfusionMatrix {
      // A single entry in the confusion matrix.
      message Entry {
        // The predicted label. For confidence_threshold > 0, we will
        // also add an entry indicating the number of items under the
        // confidence threshold.
        string predicted_label = 1;

        // Number of items being predicted as this label.
        google.protobuf.Int64Value item_count = 2;
      }

      // A single row in the confusion matrix.
      message Row {
        // The original label of this row.
        string actual_label = 1;

        // Info describing predicted label distribution.
        repeated Entry entries = 2;
      }

      // Confidence threshold used when computing the entries of the
      // confusion matrix.
      google.protobuf.DoubleValue confidence_threshold = 1;

      // One row per actual label.
      repeated Row rows = 2;
    }

    // Aggregate classification metrics.
    AggregateClassificationMetrics aggregate_classification_metrics = 1;

    // Confusion matrix at different thresholds.
    repeated ConfusionMatrix confusion_matrix_list = 2;
  }

  // Evaluation metrics for clustering models.
  message ClusteringMetrics {
    // Davies-Bouldin index.
    google.protobuf.DoubleValue davies_bouldin_index = 1;

    // Mean of squared distances between each sample to its cluster centroid.
    google.protobuf.DoubleValue mean_squared_distance = 2;
  }

  // Evaluation metrics of a model. These are either computed on all
  // training data or just the eval data based on whether eval data was used
  // during training.
  message EvaluationMetrics {
    // The metrics for the model; at most one member is set.
    oneof metrics {
      // Populated for regression models.
      RegressionMetrics regression_metrics = 1;

      // Populated for binary classification models.
      BinaryClassificationMetrics binary_classification_metrics = 2;

      // Populated for multi-class classification models.
      MultiClassClassificationMetrics multi_class_classification_metrics = 3;

      // [Beta] Populated for clustering models.
      ClusteringMetrics clustering_metrics = 4;
    }
  }

  // Information about a single training query run for the model.
  message TrainingRun {
    // Options that were used for a training run.
    message TrainingOptions {
      // The maximum number of iterations in training.
      int64 max_iterations = 1;

      // Type of loss function used during training run.
      LossType loss_type = 2;

      // Learning rate in training.
      double learn_rate = 3;

      // L1 regularization coefficient.
      google.protobuf.DoubleValue l1_regularization = 4;

      // L2 regularization coefficient.
      google.protobuf.DoubleValue l2_regularization = 5;

      // When early_stop is true, stops training when accuracy improvement is
      // less than 'min_relative_progress'.
      google.protobuf.DoubleValue min_relative_progress = 6;

      // Whether to train a model from the last checkpoint.
      google.protobuf.BoolValue warm_start = 7;

      // Whether to stop early when the loss doesn't improve significantly
      // any more (compared to min_relative_progress).
      google.protobuf.BoolValue early_stop = 8;

      // Name of input label columns in training data.
      repeated string input_label_columns = 9;

      // The data split type for training and evaluation, e.g. RANDOM.
      DataSplitMethod data_split_method = 10;

      // The fraction of evaluation data over the whole input data. The rest
      // of data will be used as training data. The format should be double.
      // Accurate to two decimal places.
      // Default value is 0.2.
      double data_split_eval_fraction = 11;

      // The column to split data with. This column won't be used as a
      // feature.
      // 1. When data_split_method is CUSTOM, the corresponding column should
      // be boolean. The rows with true value tag are eval data, and the false
      // are training data.
      // 2. When data_split_method is SEQUENTIAL, the first
      // DATA_SPLIT_EVAL_FRACTION rows (from smallest to largest) in the
      // corresponding column are used as training data, and the rest are
      // eval data. It respects the order in Orderable data types:
      // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data-type-properties
      string data_split_column = 12;

      // The strategy to determine learning rate.
      LearnRateStrategy learn_rate_strategy = 13;

      // Specifies the initial learning rate for line search to start at.
      double initial_learn_rate = 16;

      // Weights associated with each label class, for rebalancing the
      // training data.
      map<string, double> label_class_weights = 17;

      // [Beta] Distance type for clustering models.
      DistanceType distance_type = 20;

      // [Beta] Number of clusters for clustering models.
      int64 num_clusters = 21;
    }

    // Information about a single iteration of the training run.
    message IterationResult {
      // Information about a single cluster for clustering model.
      message ClusterInfo {
        // Centroid id.
        int64 centroid_id = 1;

        // Cluster radius, the average distance from centroid
        // to each point assigned to the cluster.
        google.protobuf.DoubleValue cluster_radius = 2;

        // Cluster size, the total number of points assigned to the cluster.
        google.protobuf.Int64Value cluster_size = 3;
      }

      // Index of the iteration, 0 based.
      google.protobuf.Int32Value index = 1;

      // Time taken to run the iteration in milliseconds.
      google.protobuf.Int64Value duration_ms = 4;

      // Loss computed on the training data at the end of iteration.
      google.protobuf.DoubleValue training_loss = 5;

      // Loss computed on the eval data at the end of iteration.
      google.protobuf.DoubleValue eval_loss = 6;

      // Learn rate used for this iteration.
      double learn_rate = 7;

      // [Beta] Information about top clusters for clustering models.
      repeated ClusterInfo cluster_infos = 8;
    }

    // Options that were used for this training run, includes
    // user specified and default options that were used.
    TrainingOptions training_options = 1;

    // The start time of this training run.
    google.protobuf.Timestamp start_time = 8;

    // Output of each iteration run, results.size() <= max_iterations.
    repeated IterationResult results = 6;

    // The evaluation metrics over training/eval data that were computed at the
    // end of training.
    EvaluationMetrics evaluation_metrics = 7;
  }

  // Indicates the type of the Model.
  enum ModelType {
    // Default value; the model type is not specified.
    MODEL_TYPE_UNSPECIFIED = 0;

    // Linear regression model.
    LINEAR_REGRESSION = 1;

    // Logistic regression model.
    LOGISTIC_REGRESSION = 2;

    // [Beta] K-means clustering model.
    KMEANS = 3;
  }

  // Loss metric to evaluate model training performance.
  enum LossType {
    // Default value; the loss type is not specified.
    LOSS_TYPE_UNSPECIFIED = 0;

    // Mean squared loss, used for linear regression.
    MEAN_SQUARED_LOSS = 1;

    // Mean log loss, used for logistic regression.
    MEAN_LOG_LOSS = 2;
  }

  // Distance metric used to compute the distance between two points.
  enum DistanceType {
    // Default value; the distance type is not specified.
    DISTANCE_TYPE_UNSPECIFIED = 0;

    // Euclidean distance.
    EUCLIDEAN = 1;

    // Cosine distance.
    COSINE = 2;
  }

  // Indicates the method to split input data into multiple tables.
  enum DataSplitMethod {
    // Default value; the data split method is not specified.
    DATA_SPLIT_METHOD_UNSPECIFIED = 0;

    // Splits data randomly.
    RANDOM = 1;

    // Splits data with the user provided tags.
    CUSTOM = 2;

    // Splits data sequentially.
    SEQUENTIAL = 3;

    // Data split will be skipped.
    NO_SPLIT = 4;

    // Splits data automatically: Uses NO_SPLIT if the data size is small.
    // Otherwise uses RANDOM.
    AUTO_SPLIT = 5;
  }

  // Indicates the learning rate optimization strategy to use.
  enum LearnRateStrategy {
    // Default value; the learn rate strategy is not specified.
    LEARN_RATE_STRATEGY_UNSPECIFIED = 0;

    // Use line search to determine learning rate.
    LINE_SEARCH = 1;

    // Use a constant learning rate.
    CONSTANT = 2;
  }

  // Output only. A hash of this resource.
  string etag = 1;

  // Required. Unique identifier for this model.
  ModelReference model_reference = 2;

  // Output only. The time when this model was created, in milliseconds since
  // the epoch.
  int64 creation_time = 5;

  // Output only. The time when this model was last modified, in milliseconds
  // since the epoch.
  int64 last_modified_time = 6;

  // [Optional] A user-friendly description of this model.
  // @mutable bigquery.models.patch
  string description = 12;

  // [Optional] A descriptive name for this model.
  // @mutable bigquery.models.patch
  string friendly_name = 14;

  // [Optional] The labels associated with this model. You can use these to
  // organize and group your models. Label keys and values can be no longer
  // than 63 characters, can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  // Label values are optional. Label keys must start with a letter and each
  // label in the list must have a different key.
  // @mutable bigquery.models.patch
  map<string, string> labels = 15;

  // [Optional] The time when this model expires, in milliseconds since the
  // epoch. If not present, the model will persist indefinitely. Expired models
  // will be deleted and their storage reclaimed. The defaultTableExpirationMs
  // property of the encapsulating dataset can be used to set a default
  // expirationTime on newly created models.
  // @mutable bigquery.models.patch
  int64 expiration_time = 16;

  // Output only. The geographic location where the model resides. This value
  // is inherited from the dataset.
  string location = 13;

  // Output only. Type of the model resource.
  ModelType model_type = 7;

  // Output only. Information for all training runs in increasing order of
  // start_time.
  repeated TrainingRun training_runs = 9;

  // Output only. Input feature columns that were used to train this model.
  repeated StandardSqlField feature_columns = 10;

  // Output only. Label columns that were used to train this model.
  // The output of the model will have a "predicted_" prefix to these columns.
  repeated StandardSqlField label_columns = 11;
}

// Request message for ModelService.GetModel.
message GetModelRequest {
  // Project ID of the requested model.
  string project_id = 1;

  // Dataset ID of the requested model.
  string dataset_id = 2;

  // Model ID of the requested model.
  string model_id = 3;
}

// Request message for ModelService.PatchModel.
message PatchModelRequest {
  // Project ID of the model to patch.
  string project_id = 1;

  // Dataset ID of the model to patch.
  string dataset_id = 2;

  // Model ID of the model to patch.
  string model_id = 3;

  // Patched model.
  // Follows patch semantics. Missing fields are not updated. To clear a field,
  // explicitly set to default value.
  Model model = 4;
}

// Request message for ModelService.DeleteModel.
message DeleteModelRequest {
  // Project ID of the model to delete.
  string project_id = 1;

  // Dataset ID of the model to delete.
  string dataset_id = 2;

  // Model ID of the model to delete.
  string model_id = 3;
}

// Request message for ModelService.ListModels.
message ListModelsRequest {
  // Project ID of the models to list.
  string project_id = 1;

  // Dataset ID of the models to list.
  string dataset_id = 2;

  // The maximum number of results per page.
  google.protobuf.UInt32Value max_results = 3;

  // Page token, returned by a previous call to request the next page of
  // results.
  string page_token = 4;
}

// Response message for ModelService.ListModels.
message ListModelsResponse {
  // Models in the requested dataset. Only the following fields are populated:
  // model_reference, model_type, creation_time, last_modified_time and
  // labels.
  repeated Model models = 1;

  // A token to request the next page of results.
  string next_page_token = 2;
}