Skip to content

Commit 69a4f0d

Browse files
snowflake-provisioner and Snowflake Authors authored
Project import generated by Copybara. (#18)
GitOrigin-RevId: 288c0c4da10ce230b81b6eb80316011cbb76252b Co-authored-by: Snowflake Authors <[email protected]>
1 parent 86d00b9 commit 69a4f0d

File tree

183 files changed

+5601
-891
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

183 files changed

+5601
-891
lines changed

ci/conda_recipe/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ requirements:
2525
- cloudpickle
2626
- fsspec>=2022.11,<=2023.1
2727
- numpy>=1.23,<2
28-
- packaging>=23.0,<24
28+
- packaging>=20.9,<24
2929
- pandas>=1.0.0,<2 # Limit since 2.x is not available in Snowflake Anaconda Channel yet.
3030
- pyyaml>=6.0,<7
3131
- scikit-learn>=1.2.1,<2

ci/type_ignored_targets

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,32 @@
11
//snowflake/ml/experimental/...
2-
//snowflake/ml/modeling/...
2+
//tests/integ/snowflake/ml/_internal/...
33
//tests/integ/snowflake/ml/extra_tests/...
4-
//tests/integ/snowflake/ml/preprocessing/...
4+
//tests/integ/snowflake/ml/sklearn/preprocessing/...
5+
6+
//snowflake/ml/sklearn/linear_model/...
7+
//snowflake/ml/sklearn/ensemble/...
8+
//snowflake/ml/sklearn/svm/...
9+
//snowflake/ml/sklearn/neural_network/...
10+
//snowflake/ml/sklearn/tree/...
11+
//snowflake/ml/sklearn/calibration/...
12+
//snowflake/ml/sklearn/cluster/...
13+
//snowflake/ml/sklearn/compose/...
14+
//snowflake/ml/sklearn/covariance/...
15+
//snowflake/ml/sklearn/decomposition/...
16+
//snowflake/ml/sklearn/discriminant_analysis/...
17+
//snowflake/ml/sklearn/feature_selection/...
18+
//snowflake/ml/sklearn/gaussian_process/...
19+
//snowflake/ml/sklearn/impute/...
20+
//snowflake/ml/sklearn/isotonic/...
21+
//snowflake/ml/sklearn/kernel_approximation/...
22+
//snowflake/ml/sklearn/kernel_ridge/...
23+
//snowflake/ml/sklearn/manifold/...
24+
//snowflake/ml/sklearn/mixture/...
25+
//snowflake/ml/sklearn/model_selection/...
26+
//snowflake/ml/sklearn/multiclass/...
27+
//snowflake/ml/sklearn/multioutput/...
28+
//snowflake/ml/sklearn/naive_bayes/...
29+
//snowflake/ml/sklearn/neighbors/...
30+
//snowflake/ml/sklearn/semi_supervised/...
31+
//snowflake/ml/xgboost/...
32+
//snowflake/ml/lightgbm/...

codegen/codegen_rules.bzl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,13 @@ def autogen_estimators(module, estimator_info_list):
8282
srcs = [":generate_{}".format(e.normalized_class_name)],
8383
deps = [
8484
":init",
85-
"//snowflake/ml/framework:framework",
85+
"//snowflake/ml/sklearn/framework:framework",
8686
"//snowflake/ml/_internal:telemetry",
8787
"//snowflake/ml/_internal/utils:temp_file_utils",
8888
"//snowflake/ml/_internal/utils:query_result_checker",
8989
"//snowflake/ml/_internal/utils:pkg_version_utils",
9090
"//snowflake/ml/_internal/utils:identifier",
91+
"//snowflake/ml/model:model_signature",
9192
],
9293
tags = ["skip_mypy_check"],
9394
)

codegen/sklearn_wrapper_generator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,9 +311,9 @@ def get_snow_ml_module_name(module_name: str) -> str:
311311
"""
312312
tokens = module_name.split(".")
313313
if tokens[0] == "sklearn":
314-
return "snowflake.ml.modeling." + ".".join(module_name.split(".")[1:])
314+
return "snowflake.ml.sklearn." + ".".join(module_name.split(".")[1:])
315315
else:
316-
return "snowflake.ml.modeling." + module_name
316+
return "snowflake.ml." + module_name
317317

318318
@staticmethod
319319
def can_generate_wrapper(class_object: Tuple[str, type]) -> bool:

codegen/sklearn_wrapper_template.py_template

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import numpy as np
1212
{transform.estimator_imports}
1313
from sklearn.utils.metaestimators import available_if
1414

15-
from snowflake.ml.framework.base import BaseTransformer
15+
from snowflake.ml.sklearn.framework.base import BaseTransformer
1616
from snowflake.ml._internal import telemetry
1717
from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
1818
from snowflake.ml._internal.utils import pkg_version_utils, identifier
@@ -21,6 +21,14 @@ from snowflake.snowpark import DataFrame, Session
2121
from snowflake.snowpark.functions import pandas_udf, sproc
2222
from snowflake.snowpark.types import PandasSeries
2323

24+
from snowflake.ml.model.model_signature import (
25+
DataType,
26+
FeatureSpec,
27+
ModelSignature,
28+
_infer_signature,
29+
_rename_features,
30+
)
31+
2432
_PROJECT = "ModelDevelopment"
2533
# Derive subproject from module name by removing "sklearn"
2634
# and converting module name from underscore to CamelCase
@@ -116,6 +124,7 @@ class {transform.original_class_name}(BaseTransformer):
116124
self._sklearn_object = {transform.root_module_name}.{transform.original_class_name}(
117125
{transform.sklearn_init_arguments}
118126
)
127+
self._model_signature_dict = None
119128
{transform.estimator_init_member_args}
120129

121130
def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
@@ -161,6 +170,7 @@ class {transform.original_class_name}(BaseTransformer):
161170
"Supported dataset types: snowpark.DataFrame, pandas.DataFrame."
162171
)
163172
self._is_fitted = True
173+
self._get_model_signatures(dataset)
164174
return self
165175

166176
def _fit_snowpark(self, dataset: DataFrame) -> None:
@@ -310,9 +320,9 @@ class {transform.original_class_name}(BaseTransformer):
310320
query,
311321
stage_transform_file_name,
312322
stage_result_file_name,
313-
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols),
314-
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols),
315-
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col),
323+
identifier.get_unescaped_names(self.input_cols),
324+
identifier.get_unescaped_names(self.label_cols),
325+
identifier.get_unescaped_names(self.sample_weight_col),
316326
statement_params=statement_params,
317327
)
318328

@@ -378,7 +388,7 @@ class {transform.original_class_name}(BaseTransformer):
378388
# Input columns for UDF are sorted by column names.
379389
# We need actual order of input cols to reorder dataframe before calling inference methods.
380390
input_cols = self.input_cols
381-
unquoted_input_cols = identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols)
391+
unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
382392

383393
statement_params = telemetry.get_function_usage_statement_params(
384394
project=_PROJECT,
@@ -511,9 +521,37 @@ class {transform.original_class_name}(BaseTransformer):
511521
expected_output_cols_list: List[str]
512522
) -> pd.DataFrame:
513523
output_cols = expected_output_cols_list.copy()
514-
transformed_numpy_array = getattr(self._sklearn_object, inference_method)(
515-
dataset[self.input_cols]
524+
525+
# The model expects exactly the same column names in the input df for the predict call.
526+
# Given the scenario where the user uses a snowpark DataFrame in the fit call but a pandas DataFrame in the predict call,
527+
# input cols need to match unquoted / quoted
528+
input_cols = self.input_cols
529+
unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
530+
531+
estimator = self._sklearn_object
532+
533+
input_df = dataset[input_cols] # Select input columns with quoted column names.
534+
if hasattr(estimator, "feature_names_in_"):
535+
missing_features = []
536+
for i, f in enumerate(getattr(estimator, "feature_names_in_")):
537+
if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
538+
missing_features.append(f)
539+
540+
if len(missing_features) > 0:
541+
raise ValueError(
542+
"The feature names should match with those that were passed during fit.\n"
543+
f"Features seen during fit call but not present in the input: {{missing_features}}\n"
544+
f"Features in the input dataframe : {{input_cols}}\n"
545+
)
546+
input_df.columns = getattr(estimator, "feature_names_in_")
547+
else:
548+
# Just rename the column names to unquoted identifiers.
549+
input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
550+
551+
transformed_numpy_array = getattr(estimator, inference_method)(
552+
input_df
516553
)
554+
517555
if (
518556
isinstance(transformed_numpy_array, list)
519557
and len(transformed_numpy_array) > 0
@@ -974,12 +1012,45 @@ class {transform.original_class_name}(BaseTransformer):
9741012
score_sproc_name,
9751013
query,
9761014
stage_score_file_name,
977-
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols),
978-
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols),
979-
identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col),
1015+
identifier.get_unescaped_names(self.input_cols),
1016+
identifier.get_unescaped_names(self.label_cols),
1017+
identifier.get_unescaped_names(self.sample_weight_col),
9801018
statement_params=statement_params,
9811019
)
9821020

9831021
cleanup_temp_files([local_score_file_name])
9841022

9851023
return score
1024+
1025+
def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
1026+
self._model_signature_dict: Dict[str, ModelSignature] = dict()
1027+
1028+
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
1029+
1030+
inputs = _infer_signature(dataset[self.input_cols], "input")
1031+
if hasattr(self, "predict"):
1032+
# For classifier, the type of predict is the same as the type of label
1033+
if self._sklearn_object._estimator_type == 'classifier':
1034+
outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1035+
outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1036+
self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1037+
# For regressor, the type of predict is float64
1038+
elif self._sklearn_object._estimator_type == 'regressor':
1039+
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1040+
self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1041+
1042+
for prob_func in PROB_FUNCTIONS:
1043+
if hasattr(self, prob_func):
1044+
output_cols_prefix: str = f"{{prob_func}}_"
1045+
output_column_names = self._get_output_column_names(output_cols_prefix)
1046+
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1047+
self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1048+
1049+
# TODO: Add support for transform method
1050+
1051+
1052+
@property
1053+
def model_signatures(self) -> Dict[str, ModelSignature]:
1054+
if self._model_signature_dict is None:
1055+
raise RuntimeError("Estimator not fitted before accessing property model_signatures! ")
1056+
return self._model_signature_dict

conda-env-extended.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
channels:
99
- conda-forge
1010
dependencies:
11-
- moto==4.0.11 # SNOW-690705
1211
- torchdata==0.4.1 # SNOW-702102
1312
# SNOW-747683: Tensorflow is available on snowflake conda channel,
1413
# however, macos-arm64 is only available on conda-forge.

conda-env-snowflake.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@ dependencies:
1717
- boto3==1.24.28
1818
- conda-libmamba-solver==23.1.0
1919
- coverage==6.3.2 # not a package dependency.
20+
- docker-py==4.4.1
2021
- flask-cors==3.0.10
2122
- flask==2.1.3
2223
- fsspec==2022.10.0
2324
- inflection==0.5.1
2425
- joblib==1.1.1
2526
- lightgbm==3.3.5
27+
- moto==4.0.11
2628
- networkx==2.8.4
2729
- numpy==1.23.4
2830
- packaging==23.0
@@ -38,4 +40,4 @@ dependencies:
3840
- sqlparse==0.4.3
3941
- typing-extensions==4.5.0
4042
- xgboost==1.7.3
41-
- mypy==0.981 # not a package dependency.
43+
- mypy==0.981 # not a package dependency.

conda-env.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ dependencies:
1212
- boto3==1.24.28
1313
- conda-libmamba-solver==23.1.0
1414
- coverage==6.3.2
15+
- docker-py==4.4.1
1516
- flask-cors==3.0.10
1617
- flask==2.1.3
1718
- fsspec==2022.10.0

snowflake/ml/BUILD.bazel

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ snowml_wheel(
4141
"cloudpickle", # Version range is specified by snowpark. We are implicitly depending on it.
4242
"fsspec[http]>=2022.11,<=2023.1",
4343
"numpy>=1.23,<2",
44-
"packaging>=23.0,<24",
44+
"packaging>=20.9,<24",
4545
"pandas>=1.0.0,<2", # Limit since 2.x is not available in Snowflake Anaconda Channel yet.
4646
"pyyaml>=6.0,<7",
4747
"scikit-learn>=1.2.1,<2",
@@ -55,37 +55,37 @@ snowml_wheel(
5555
version = VERSION,
5656
deps = [
5757
"//snowflake/ml/metrics:metrics_pkg",
58-
"//snowflake/ml/preprocessing:preprocessing_pkg",
58+
"//snowflake/ml/sklearn/preprocessing:preprocessing_pkg",
5959
"//snowflake/ml/utils:utils_pkg",
6060
"//snowflake/ml/fileset:fileset_pkg",
6161
"//snowflake/ml/registry:model_registry_pkg",
6262
# Autogen packages
63-
"//snowflake/ml/modeling/linear_model:sklearn_linear_model_pkg",
64-
"//snowflake/ml/modeling/ensemble:sklearn_ensemble_pkg",
65-
"//snowflake/ml/modeling/svm:sklearn_svm_pkg",
66-
"//snowflake/ml/modeling/neural_network:sklearn_neural_network_pkg",
67-
"//snowflake/ml/modeling/tree:sklearn_tree_pkg",
68-
"//snowflake/ml/modeling/xgboost:xgboost_pkg",
69-
"//snowflake/ml/modeling/calibration:sklearn_calibration_pkg",
70-
"//snowflake/ml/modeling/cluster:sklearn_cluster_pkg",
71-
"//snowflake/ml/modeling/compose:sklearn_compose_pkg",
72-
"//snowflake/ml/modeling/covariance:sklearn_covariance_pkg",
73-
"//snowflake/ml/modeling/decomposition:sklearn_decomposition_pkg",
74-
"//snowflake/ml/modeling/discriminant_analysis:sklearn_discriminant_analysis_pkg",
75-
"//snowflake/ml/modeling/feature_selection:sklearn_feature_selection_pkg",
76-
"//snowflake/ml/modeling/gaussian_process:sklearn_gaussian_process_pkg",
77-
"//snowflake/ml/modeling/impute:sklearn_impute_pkg",
78-
"//snowflake/ml/modeling/isotonic:sklearn_isotonic_pkg",
79-
"//snowflake/ml/modeling/kernel_approximation:sklearn_kernel_approximation_pkg",
80-
"//snowflake/ml/modeling/kernel_ridge:sklearn_kernel_ridge_pkg",
81-
"//snowflake/ml/modeling/manifold:sklearn_manifold_pkg",
82-
"//snowflake/ml/modeling/mixture:sklearn_mixture_pkg",
83-
"//snowflake/ml/modeling/model_selection:sklearn_model_selection_pkg",
84-
"//snowflake/ml/modeling/multiclass:sklearn_multiclass_pkg",
85-
"//snowflake/ml/modeling/multioutput:sklearn_multioutput_pkg",
86-
"//snowflake/ml/modeling/naive_bayes:sklearn_naive_bayes_pkg",
87-
"//snowflake/ml/modeling/neighbors:sklearn_neighbors_pkg",
88-
"//snowflake/ml/modeling/semi_supervised:sklearn_semi_supervised_pkg",
89-
"//snowflake/ml/modeling/lightgbm:lightgbm_pkg",
63+
"//snowflake/ml/sklearn/linear_model:sklearn_linear_model_pkg",
64+
"//snowflake/ml/sklearn/ensemble:sklearn_ensemble_pkg",
65+
"//snowflake/ml/sklearn/svm:sklearn_svm_pkg",
66+
"//snowflake/ml/sklearn/neural_network:sklearn_neural_network_pkg",
67+
"//snowflake/ml/sklearn/tree:sklearn_tree_pkg",
68+
"//snowflake/ml/sklearn/calibration:sklearn_calibration_pkg",
69+
"//snowflake/ml/sklearn/cluster:sklearn_cluster_pkg",
70+
"//snowflake/ml/sklearn/compose:sklearn_compose_pkg",
71+
"//snowflake/ml/sklearn/covariance:sklearn_covariance_pkg",
72+
"//snowflake/ml/sklearn/decomposition:sklearn_decomposition_pkg",
73+
"//snowflake/ml/sklearn/discriminant_analysis:sklearn_discriminant_analysis_pkg",
74+
"//snowflake/ml/sklearn/feature_selection:sklearn_feature_selection_pkg",
75+
"//snowflake/ml/sklearn/gaussian_process:sklearn_gaussian_process_pkg",
76+
"//snowflake/ml/sklearn/impute:sklearn_impute_pkg",
77+
"//snowflake/ml/sklearn/isotonic:sklearn_isotonic_pkg",
78+
"//snowflake/ml/sklearn/kernel_approximation:sklearn_kernel_approximation_pkg",
79+
"//snowflake/ml/sklearn/kernel_ridge:sklearn_kernel_ridge_pkg",
80+
"//snowflake/ml/sklearn/manifold:sklearn_manifold_pkg",
81+
"//snowflake/ml/sklearn/mixture:sklearn_mixture_pkg",
82+
"//snowflake/ml/sklearn/model_selection:sklearn_model_selection_pkg",
83+
"//snowflake/ml/sklearn/multiclass:sklearn_multiclass_pkg",
84+
"//snowflake/ml/sklearn/multioutput:sklearn_multioutput_pkg",
85+
"//snowflake/ml/sklearn/naive_bayes:sklearn_naive_bayes_pkg",
86+
"//snowflake/ml/sklearn/neighbors:sklearn_neighbors_pkg",
87+
"//snowflake/ml/sklearn/semi_supervised:sklearn_semi_supervised_pkg",
88+
"//snowflake/ml/xgboost:xgboost_pkg",
89+
"//snowflake/ml/lightgbm:lightgbm_pkg",
9090
],
9191
)

snowflake/ml/_internal/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ py_library(
2727
py_test(
2828
name = "file_utils_test",
2929
srcs = ["file_utils_test.py"],
30-
timeout = "short",
3130
deps = [
3231
":file_utils",
3332
],

0 commit comments

Comments
 (0)