You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
251 lines
11 KiB
251 lines
11 KiB
// Copyright 2018 Google LLC.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
syntax = "proto3";

package google.cloud.automl.v1beta1;

import "google/api/annotations.proto";
import "google/cloud/automl/v1beta1/column_spec.proto";
import "google/cloud/automl/v1beta1/data_stats.proto";
import "google/cloud/automl/v1beta1/ranges.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";

// Language-specific code generation options.
option go_package = "google.golang.org/genproto/googleapis/cloud/automl/v1beta1;automl";
option java_multiple_files = true;
option java_package = "com.google.cloud.automl.v1beta1";
option php_namespace = "Google\\Cloud\\AutoMl\\V1beta1";

// Metadata for a dataset used for AutoML Tables.
message TablesDatasetMetadata {
  // Output only. The table_spec_id of the primary table of this dataset.
  string primary_table_spec_id = 1;

  // column_spec_id of the primary table's column that should be used as the
  // training & prediction target.
  // This column must be non-nullable and have one of following data types
  // (otherwise model creation will error):
  //
  // * CATEGORY
  // * ARRAY(CATEGORY)
  // * FLOAT64
  //
  // Furthermore, if the type is CATEGORY or ARRAY(CATEGORY), then only up to
  // 40 unique values may exist in that column across all rows, but for
  // ARRAY(CATEGORY) unique values are counted as elements of the ARRAY (i.e.
  // following 3 ARRAY-s: [A, B], [A], [B] are counted as having 2 unique
  // values).
  //
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string target_column_spec_id = 2;

  // column_spec_id of the primary table's column that should be used as the
  // weight column, i.e. the higher the value the more important the row will be
  // during model training.
  // Required type: FLOAT64.
  // Allowed values: 0 to 10000, inclusive on both ends; 0 means the row is
  // ignored for training.
  // If not set all rows are assumed to have equal weight of 1.
  //
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string weight_column_spec_id = 3;

  // column_spec_id of the primary table column which specifies a possible ML
  // use of the row, i.e. the column will be used to split the rows into TRAIN,
  // VALIDATE and TEST sets.
  // Required type: STRING.
  // This column, if set, must either have all of `TRAIN`, `VALIDATE`, `TEST`
  // among its values, or only have `TEST`, `UNASSIGNED` values. In the latter
  // case the rows with `UNASSIGNED` value will be assigned by AutoML. Note
  // that if a given ml use distribution makes it impossible to create a "good"
  // model, that call will error describing the issue.
  // If both this column_spec_id and primary table's time_column_spec_id are not
  // set, then all rows are treated as `UNASSIGNED`.
  //
  // NOTE: Updates of this field will instantly affect any other users
  // concurrently working with the dataset.
  string ml_use_column_spec_id = 4;

  // Output only. Correlations between
  //
  // [target_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.target_column_spec_id],
  // and other columns of the
  //
  // [primary_table][google.cloud.automl.v1beta1.TablesDatasetMetadata.primary_table_spec_id].
  // Only set if the target column is set. Mapping from other column spec id to
  // its CorrelationStats with the target column.
  // This field may be stale, see the stats_update_time field for the
  // timestamp at which these stats were last updated.
  map<string, CorrelationStats> target_column_correlations = 6;

  // The most recent timestamp when target_column_correlations field and all
  // descendant ColumnSpec.data_stats and ColumnSpec.top_correlated_columns
  // fields were last (re-)generated. Any changes that happened to the dataset
  // afterwards are not reflected in these fields values. The regeneration
  // happens in the background on a best effort basis.
  google.protobuf.Timestamp stats_update_time = 7;

  // NOTE(review): field number 5 is not used by this message. If it belonged
  // to a removed field, add `reserved 5;` so it cannot be reused - confirm
  // against the schema history.
}

// Model metadata specific to AutoML Tables.
message TablesModelMetadata {
  // Column spec of the dataset's primary table's column the model is
  // predicting. Snapshotted when model creation started.
  // Only 3 fields are used:
  //
  // * name - May be set on CreateModel, if it's not then the ColumnSpec
  //          corresponding to the current target_column_spec_id of the dataset
  //          the model is trained from is used.
  //          If neither is set, CreateModel will error.
  // * display_name - Output only.
  // * data_type - Output only.
  ColumnSpec target_column_spec = 2;

  // Column specs of the dataset's primary table's columns, on which
  // the model is trained and which are used as the input for predictions.
  // The
  //
  // [target_column][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // as well as, according to dataset's state upon model creation,
  //
  // [weight_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.weight_column_spec_id],
  // and
  //
  // [ml_use_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.ml_use_column_spec_id]
  // must never be included here.
  // Only 3 fields are used:
  //
  // * name - May be set on CreateModel, if set only the columns specified are
  //          used, otherwise all primary table's columns (except the ones listed
  //          above) are used for the training and prediction input.
  // * display_name - Output only.
  // * data_type - Output only.
  repeated ColumnSpec input_feature_column_specs = 3;

  // Objective function the model is optimizing towards. The training process
  // creates a model that maximizes/minimizes the value of the objective
  // function over the validation set.
  //
  // The supported optimization objectives depend on the prediction_type.
  // If the field is not set, a default objective function is used.
  //
  // CLASSIFICATION_BINARY:
  //   "MAXIMIZE_AU_ROC" (default) - Maximize the area under the receiver
  //                                 operating characteristic (ROC) curve.
  //   "MINIMIZE_LOG_LOSS" - Minimize log loss.
  //   "MAXIMIZE_AU_PRC" - Maximize the area under the precision-recall curve.
  //
  // CLASSIFICATION_MULTI_CLASS :
  //   "MINIMIZE_LOG_LOSS" (default) - Minimize log loss.
  //
  // CLASSIFICATION_MULTI_LABEL:
  //   "MINIMIZE_LOG_LOSS" (default) - Minimize log loss.
  //
  // REGRESSION:
  //   "MINIMIZE_RMSE" (default) - Minimize root-mean-squared error (RMSE).
  //   "MINIMIZE_MAE" - Minimize mean-absolute error (MAE).
  //   "MINIMIZE_RMSLE" - Minimize root-mean-squared log error (RMSLE).
  //
  // FORECASTING:
  //   "MINIMIZE_RMSE" (default) - Minimize root-mean-squared error (RMSE).
  //   "MINIMIZE_MAE" - Minimize mean-absolute error (MAE).
  string optimization_objective = 4;

  // Output only. Auxiliary information for each of the
  // input_feature_column_specs, with respect to this particular model.
  repeated TablesModelColumnInfo tables_model_column_info = 5;

  // The train budget of creating this model, expressed in milli node hours
  // i.e. 1,000 value in this field means 1 node hour.
  //
  // The training cost of the model will not exceed this budget. The final cost
  // will be attempted to be close to the budget, though may end up being (even)
  // noticeably smaller - at the backend's discretion. This especially may
  // happen when further model training ceases to provide any improvements.
  //
  // If the budget is set to a value known to be insufficient to train a
  // model for the given dataset, the training won't be attempted and
  // will error.
  int64 train_budget_milli_node_hours = 6;

  // Output only. The actual training cost of the model, expressed in milli
  // node hours, i.e. 1,000 value in this field means 1 node hour. Guaranteed
  // to not exceed the train budget.
  int64 train_cost_milli_node_hours = 7;

  // NOTE(review): field number 1 is not used by this message. If it belonged
  // to a removed field, add `reserved 1;` so it cannot be reused - confirm
  // against the schema history.
}

// Contains annotation details specific to Tables.
message TablesAnnotation {
  // Output only. A confidence estimate between 0.0 and 1.0, inclusive. A higher
  // value means greater confidence in the returned value.
  // For
  //
  // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // of ARRAY(CATEGORY) data type, this is a confidence that one of the values
  // in the ARRAY would be the provided value.
  // For
  //
  // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // of FLOAT64 data type the score is not populated.
  float score = 1;

  // Output only. Only populated when
  //
  // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  // has FLOAT64 data type (i.e. for regression predictions). An interval in
  // which the exactly correct target value has 95% chance to be in.
  DoubleRange prediction_interval = 4;

  // The predicted value of the row's
  //
  // [target_column][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec].
  // The value depends on the column's DataType:
  //
  // * CATEGORY - the predicted (with the above confidence `score`) CATEGORY
  //              value.
  // * FLOAT64 - the predicted (with the above confidence `score`) FLOAT64 value.
  // * ARRAY(CATEGORY) - CATEGORY value meaning that this value would be in the
  //                     ARRAY in that column (with the above confidence `score`).
  google.protobuf.Value value = 2;

  // Output only. Auxiliary information for each of the model's
  //
  // [input_feature_column_specs'][google.cloud.automl.v1beta1.TablesModelMetadata.input_feature_column_specs]
  // with respect to this particular prediction.
  repeated TablesModelColumnInfo tables_model_column_info = 3;
}

// Information specific to a given column and Tables Model, in the context
// of the Model and the predictions created by it.
message TablesModelColumnInfo {
  // Output only. The name of the ColumnSpec describing the column. Not
  // populated when this proto is outputted to BigQuery.
  string column_spec_name = 1;

  // Output only. The display name of the column (same as the display_name of
  // its ColumnSpec).
  string column_display_name = 2;

  // Output only.
  //
  // When given as part of a Model:
  // Measurement of how much model predictions correctness on the TEST data
  // depend on values in this column. A value between 0 and 1, higher means
  // higher influence. These values are normalized - for all input feature
  // columns of a given model they add to 1.
  //
  // When given back by Predict or Batch Predict:
  // Measurement of how impactful for the prediction returned for the given row
  // the value in this column was. A value between 0 and 1, higher means larger
  // impact. These values are normalized - for all input feature columns of a
  // single predicted row they add to 1.
  float feature_importance = 3;
}