# Copyright 2019 The Feast Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import sys
from collections import OrderedDict
from typing import Dict, Union
from typing import List
import grpc
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from feast.core.CoreService_pb2 import (
GetFeastCoreVersionRequest,
ListFeatureSetsResponse,
ApplyFeatureSetRequest,
ListFeatureSetsRequest,
ApplyFeatureSetResponse,
GetFeatureSetRequest,
GetFeatureSetResponse,
)
from feast.core.CoreService_pb2_grpc import CoreServiceStub
from feast.exceptions import format_grpc_exception
from feast.feature_set import FeatureSet, Entity
from feast.job import Job
from feast.loaders.file import export_dataframe_to_staging_location
from feast.loaders.ingest import ingest_table_to_kafka
from feast.serving.ServingService_pb2 import GetFeastServingInfoResponse
from feast.serving.ServingService_pb2 import (
GetOnlineFeaturesRequest,
GetBatchFeaturesRequest,
GetFeastServingInfoRequest,
GetOnlineFeaturesResponse,
DatasetSource,
DataFormat,
FeatureSetRequest,
FeastServingType,
)
from feast.serving.ServingService_pb2_grpc import ServingServiceStub
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# Timeouts (in seconds) passed to gRPC calls: a short one for connection and
# info requests, and a much longer one for applying feature sets.
GRPC_CONNECTION_TIMEOUT_DEFAULT: int = 3
GRPC_CONNECTION_TIMEOUT_APPLY: int = 600

# Environment variables consulted when a service URL is not set explicitly.
FEAST_SERVING_URL_ENV_KEY: str = "FEAST_SERVING_URL"
FEAST_CORE_URL_ENV_KEY: str = "FEAST_CORE_URL"

BATCH_FEATURE_REQUEST_WAIT_TIME_SECONDS: int = 300

# Default number of ingestion workers. os.cpu_count() may return None when
# the CPU count cannot be determined.
CPU_COUNT: Union[int, None] = os.cpu_count()
class Client:
    """
    Feast Client: Used for creating, managing, and retrieving features.

    The Core and Serving URLs can be passed to the constructor, assigned
    through the ``core_url`` / ``serving_url`` properties, or supplied through
    the ``FEAST_CORE_URL`` / ``FEAST_SERVING_URL`` environment variables.
    gRPC connections are opened lazily, the first time a method needs them.
    """

    def __init__(
        self, core_url: str = None, serving_url: str = None, verbose: bool = False
    ):
        """
        The Feast Client should be initialized with at least one service url

        Args:
            core_url: Feast Core URL. Used to manage features
            serving_url: Feast Serving URL. Used to retrieve features
            verbose: Enable verbose logging (stored on the client; not
                otherwise used within this class)
        """
        self._core_url = core_url
        self._serving_url = serving_url
        self._verbose = verbose
        # Channels and stubs are created on demand by _connect_core() and
        # _connect_serving().
        self.__core_channel: grpc.Channel = None
        self.__serving_channel: grpc.Channel = None
        self._core_service_stub: CoreServiceStub = None
        self._serving_service_stub: ServingServiceStub = None

    @property
    def core_url(self) -> str:
        """
        Retrieve Feast Core URL

        Returns:
            The explicitly configured URL if there is one, otherwise the value
            of the FEAST_CORE_URL environment variable, otherwise an empty
            string
        """
        if self._core_url is not None:
            return self._core_url
        return os.getenv(FEAST_CORE_URL_ENV_KEY, "")

    @core_url.setter
    def core_url(self, value: str):
        """
        Set the Feast Core URL

        Args:
            value: Feast Core URL string
        """
        self._core_url = value

    @property
    def serving_url(self) -> str:
        """
        Retrieve Feast Serving URL

        Returns:
            The explicitly configured URL if there is one, otherwise the value
            of the FEAST_SERVING_URL environment variable, otherwise an empty
            string
        """
        if self._serving_url is not None:
            return self._serving_url
        return os.getenv(FEAST_SERVING_URL_ENV_KEY, "")

    @serving_url.setter
    def serving_url(self, value: str):
        """
        Set the Feast Serving URL

        Args:
            value: Feast Serving URL string
        """
        self._serving_url = value

    def version(self):
        """
        Returns version information from Feast Core and Feast Serving

        Both services are connected to first. A gRPC error while querying a
        version is printed and leaves that service reported as
        "not connected" with an empty version.

        Returns:
            Dictionary with a "core" and a "serving" entry, each containing
            the "url", "version" and "status" of the service
        """
        self._connect_core()
        self._connect_serving()

        core_version = ""
        serving_version = ""
        core_status = "not connected"
        serving_status = "not connected"

        try:
            core_version = self._core_service_stub.GetFeastCoreVersion(
                GetFeastCoreVersionRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
            ).version
            core_status = "connected"
        except grpc.RpcError as e:
            print(format_grpc_exception("GetFeastCoreVersion", e.code(), e.details()))

        try:
            serving_version = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
            ).version
            serving_status = "connected"
        except grpc.RpcError as e:
            print(format_grpc_exception("GetFeastServingInfo", e.code(), e.details()))

        return {
            "core": {
                "url": self.core_url,
                "version": core_version,
                "status": core_status,
            },
            "serving": {
                "url": self.serving_url,
                "version": serving_version,
                "status": serving_status,
            },
        }

    def _connect_core(self, skip_if_connected: bool = True):
        """
        Connect to Core API

        Args:
            skip_if_connected: Do not attempt to connect if already connected

        Raises:
            ValueError: If no Feast Core URL has been configured
        """
        if skip_if_connected and self._core_service_stub:
            return

        if not self.core_url:
            raise ValueError("Please set Feast Core URL.")

        if self.__core_channel is None:
            self.__core_channel = grpc.insecure_channel(self.core_url)

        try:
            grpc.channel_ready_future(self.__core_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
            )
        except grpc.FutureTimeoutError:
            print(
                f"Connection timed out while attempting to connect to Feast Core gRPC server {self.core_url}"
            )
            # NOTE(review): terminating the interpreter from library code is
            # harsh; kept as-is because callers may rely on this behaviour.
            sys.exit(1)
        else:
            self._core_service_stub = CoreServiceStub(self.__core_channel)

    def _connect_serving(self, skip_if_connected=True):
        """
        Connect to Serving API

        Args:
            skip_if_connected: Do not attempt to connect if already connected

        Raises:
            ValueError: If no Feast Serving URL has been configured
        """
        if skip_if_connected and self._serving_service_stub:
            return

        if not self.serving_url:
            raise ValueError("Please set Feast Serving URL.")

        if self.__serving_channel is None:
            self.__serving_channel = grpc.insecure_channel(self.serving_url)

        try:
            grpc.channel_ready_future(self.__serving_channel).result(
                timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
            )
        except grpc.FutureTimeoutError:
            print(
                f"Connection timed out while attempting to connect to Feast Serving gRPC server {self.serving_url} "
            )
            # NOTE(review): terminating the interpreter from library code is
            # harsh; kept as-is because callers may rely on this behaviour.
            sys.exit(1)
        else:
            self._serving_service_stub = ServingServiceStub(self.__serving_channel)

    def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]):
        """
        Idempotently registers feature set(s) with Feast Core. Either a single
        feature set or a list can be provided.

        Args:
            feature_sets: List of feature sets that will be registered

        Raises:
            ValueError: If an item is not a FeatureSet
        """
        if not isinstance(feature_sets, list):
            feature_sets = [feature_sets]

        for feature_set in feature_sets:
            if not isinstance(feature_set, FeatureSet):
                raise ValueError(
                    f"Could not determine feature set type to apply {feature_set}"
                )
            self._apply_feature_set(feature_set)

    def _apply_feature_set(self, feature_set: FeatureSet):
        """
        Registers a single feature set with Feast

        A gRPC error returned by Feast Core is printed rather than raised.

        Args:
            feature_set: Feature set that will be registered

        Raises:
            Exception: If the feature set fails its own validity check
        """
        self._connect_core()
        feature_set._client = self

        valid, message = feature_set.is_valid()
        if not valid:
            raise Exception(message)

        try:
            # Convert the feature set to a request and send to Feast Core
            apply_fs_response = self._core_service_stub.ApplyFeatureSet(
                ApplyFeatureSetRequest(feature_set=feature_set.to_proto()),
                timeout=GRPC_CONNECTION_TIMEOUT_APPLY,
            )  # type: ApplyFeatureSetResponse

            # Extract the returned feature set
            applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set)

            # If the feature set has changed, update the local copy
            if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED:
                print(
                    f'Feature set updated/created: "{applied_fs.name}:{applied_fs.version}".'
                )
                # Deep copy from the returned feature set to the local feature set
                feature_set._update_from_feature_set(applied_fs, is_dirty=False)
                return

            # If no change has been applied, do nothing
            if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE:
                print(f"No change detected in feature set {feature_set.name}")
                return
        except grpc.RpcError as e:
            print(format_grpc_exception("ApplyFeatureSet", e.code(), e.details()))

    def list_feature_sets(self) -> List[FeatureSet]:
        """
        Retrieve a list of feature sets from Feast Core

        Returns:
            List of feature sets, each attached to this client

        Raises:
            Exception: If the gRPC call to Feast Core fails
        """
        self._connect_core()

        try:
            # Get latest feature sets from Feast Core
            feature_set_protos = self._core_service_stub.ListFeatureSets(
                ListFeatureSetsRequest()
            )  # type: ListFeatureSetsResponse
        except grpc.RpcError as e:
            raise Exception(
                format_grpc_exception("ListFeatureSets", e.code(), e.details())
            )

        # Extract feature sets and return
        feature_sets = []
        for feature_set_proto in feature_set_protos.feature_sets:
            feature_set = FeatureSet.from_proto(feature_set_proto)
            feature_set._client = self
            feature_sets.append(feature_set)
        return feature_sets

    def get_feature_set(
        self, name: str, version: int = None, fail_if_missing: bool = False
    ) -> Union[FeatureSet, None]:
        """
        Retrieves a feature set. If no version is specified then the latest
        version will be returned.

        A gRPC error is printed rather than raised; in that case None is
        returned unless fail_if_missing is set.

        Args:
            name: Name of feature set
            version: Version of feature set
            fail_if_missing: Raise an error if feature set is not found

        Returns:
            Returns either the specified feature set, or None if not found

        Raises:
            Exception: If the feature set could not be retrieved and
                fail_if_missing is True
        """
        self._connect_core()

        try:
            name = name.strip()
            if version is None:
                # Version 0 is what gets sent when no version was requested
                version = 0
            get_feature_set_response = self._core_service_stub.GetFeatureSet(
                GetFeatureSetRequest(name=name, version=version)
            )  # type: GetFeatureSetResponse
            feature_set = get_feature_set_response.feature_set
        except grpc.RpcError as e:
            print(format_grpc_exception("GetFeatureSet", e.code(), e.details()))
        else:
            if feature_set is not None:
                return FeatureSet.from_proto(feature_set)

        if fail_if_missing:
            raise Exception(
                f'Could not find feature set with name "{name}" and '
                f'version "{version}"'
            )

    def list_entities(self) -> Dict[str, Entity]:
        """
        Returns a dictionary of entities across all feature sets

        When several feature sets declare an entity with the same name, the
        one from the feature set listed last is kept.

        Returns:
            Dictionary of entities, indexed by name
        """
        entities_dict = OrderedDict()
        for fs in self.list_feature_sets():
            for entity in fs.entities:
                entities_dict[entity.name] = entity
        return entities_dict

    def get_batch_features(
        self, feature_ids: List[str], entity_rows: pd.DataFrame
    ) -> Job:
        """
        Retrieves historical features from a Feast Serving deployment.

        The provided dataframe is not modified: the column rename and the
        timezone removal described below are applied to a copy.

        Args:
            feature_ids: List of feature ids that will be returned for each
                entity. Each feature id should have the following format
                "feature_set_name:version:feature_name".
            entity_rows: Pandas dataframe containing entities and a 'datetime'
                column. Each entity in a feature set must be present as a column
                in this dataframe. The 'datetime' column is renamed to
                'event_timestamp' and any timezone information is removed
                from it before the rows are staged.

        Returns:
            Returns a job object that can be used to monitor retrieval progress
            asynchronously, and can be used to materialize the results. If the
            gRPC call fails, the error is printed and None is returned.

        Raises:
            ValueError: If entity_rows has no 'datetime' column, lacks an
                entity column, or a requested feature set cannot be found
            Exception: If the connected Feast Serving is not a batch store

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> import pandas as pd
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_ids = ["customer:1:bookings_7d"]
            >>> entity_rows = pd.DataFrame(
            ...     {
            ...         "datetime": [datetime.now() for _ in range(3)],
            ...         "customer": [1001, 1002, 1003],
            ...     }
            ... )
            >>> feature_retrieval_job = feast_client.get_batch_features(feature_ids, entity_rows)
            >>> df = feature_retrieval_job.to_dataframe()
            >>> print(df)
        """
        self._connect_serving()

        try:
            fs_request = _build_feature_set_request(feature_ids)

            # Validate entity rows based on entities in Feast Core
            self._validate_entity_rows_for_batch_retrieval(entity_rows, fs_request)

            # We want the timestamp column naming to be consistent with the
            # rest of Feast. rename() returns a new dataframe, so the caller's
            # dataframe is left untouched by this and the following step.
            entity_rows = entity_rows.rename(columns={"datetime": "event_timestamp"})

            # Remove timezone from datetime column
            if isinstance(
                entity_rows["event_timestamp"].dtype,
                pd.core.dtypes.dtypes.DatetimeTZDtype,
            ):
                entity_rows["event_timestamp"] = pd.DatetimeIndex(
                    entity_rows["event_timestamp"]
                ).tz_localize(None)

            # Retrieve serving information to determine store type and
            # staging location
            serving_info = self._serving_service_stub.GetFeastServingInfo(
                GetFeastServingInfoRequest(), timeout=GRPC_CONNECTION_TIMEOUT_DEFAULT
            )  # type: GetFeastServingInfoResponse

            if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
                # Use the property (not the raw attribute) so that a URL taken
                # from the environment variable is reported correctly.
                raise Exception(
                    f'You are connected to a store "{self.serving_url}" which '
                    f"does not support batch retrieval "
                )

            # Export and upload entity row dataframe to staging location
            # provided by Feast
            staged_file = export_dataframe_to_staging_location(
                entity_rows, serving_info.job_staging_location
            )  # type: str

            request = GetBatchFeaturesRequest(
                feature_sets=fs_request,
                dataset_source=DatasetSource(
                    file_source=DatasetSource.FileSource(
                        file_uris=[staged_file], data_format=DataFormat.DATA_FORMAT_AVRO
                    )
                ),
            )

            # Retrieve Feast Job object to manage life cycle of retrieval
            response = self._serving_service_stub.GetBatchFeatures(request)
            return Job(response.job, self._serving_service_stub)
        except grpc.RpcError as e:
            print(format_grpc_exception("GetBatchFeatures", e.code(), e.details()))

    def _validate_entity_rows_for_batch_retrieval(
        self, entity_rows, feature_sets_request
    ):
        """
        Validate whether an entity_row dataframe contains the correct
        information for batch retrieval

        Args:
            entity_rows: Pandas dataframe containing entities and datetime
                column. Each entity in a feature set must be present as a
                column in this dataframe.
            feature_sets_request: Feature sets that will be requested

        Raises:
            ValueError: If the "datetime" column is missing, a requested
                feature set cannot be found, or an entity column is missing
        """
        # Ensure datetime column exists
        if "datetime" not in entity_rows.columns:
            raise ValueError(
                f'Entity rows does not contain "datetime" column in columns '
                f"{entity_rows.columns}"
            )

        # Validate dataframe columns based on feature set entities
        for feature_set in feature_sets_request:
            fs = self.get_feature_set(
                name=feature_set.name, version=feature_set.version
            )
            if fs is None:
                raise ValueError(
                    f'Feature set "{feature_set.name}:{feature_set.version}" '
                    f"could not be found"
                )
            for entity_type in fs.entities:
                if entity_type.name not in entity_rows.columns:
                    raise ValueError(
                        f'Dataframe does not contain entity "{entity_type.name}"'
                        f' column in columns "{entity_rows.columns}"'
                    )

    def get_online_features(
        self,
        feature_ids: List[str],
        entity_rows: List[GetOnlineFeaturesRequest.EntityRow],
    ) -> GetOnlineFeaturesResponse:
        """
        Retrieves the latest online feature data from Feast Serving

        Args:
            feature_ids: List of feature Ids in the following format
                [feature_set_name]:[version]:[feature_name]
                example:
                    ["feature_set_1:6:my_feature_1",
                    "feature_set_1:6:my_feature_2",]
            entity_rows: List of GetFeaturesRequest.EntityRow where each row
                contains entities. Timestamp should not be set for online
                retrieval.

        Returns:
            The response returned by Feast Serving for the provided entities.
            If the gRPC call fails, the error is printed and None is returned.
        """
        self._connect_serving()

        try:
            response = self._serving_service_stub.GetOnlineFeatures(
                GetOnlineFeaturesRequest(
                    feature_sets=_build_feature_set_request(feature_ids),
                    entity_rows=entity_rows,
                )
            )  # type: GetOnlineFeaturesResponse
        except grpc.RpcError as e:
            print(format_grpc_exception("GetOnlineFeatures", e.code(), e.details()))
        else:
            return response

    def ingest(
        self,
        feature_set: Union[str, FeatureSet],
        source: Union[pd.DataFrame, str],
        version: int = None,
        force_update: bool = False,
        max_workers: int = CPU_COUNT,
        disable_progress_bar: bool = False,
        chunk_size: int = 5000,
        timeout: int = None,
    ):
        """
        Loads feature data into Feast for a specific feature set.

        Args:
            feature_set: Name of feature set or a feature set object
            source: Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                    * parquet
                    * csv
                    * json
            version: Feature set version
            force_update: Automatically update feature set based on source data
                prior to ingesting. This will also register changes to Feast
            max_workers: Number of worker processes to use to encode values
            disable_progress_bar: Disable printing of progress statistics
            chunk_size: Maximum amount of rows to load into memory and ingest at
                a time
            timeout: Seconds to wait before ingestion times out

        Raises:
            Exception: If feature_set is neither a name nor a FeatureSet, if
                the feature set cannot be found in Feast Core, or if its
                source type is not supported for ingestion
        """
        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            if version is None:
                version = feature_set.version
        elif isinstance(feature_set, str):
            name = feature_set
        else:
            raise Exception("Feature set name must be provided")

        table = _read_table_from_source(source)

        # Update the feature set based on DataFrame schema
        if force_update:
            if isinstance(feature_set, str):
                # Only a name was given: fetch the registered feature set so
                # there is an object whose fields can be inferred and applied.
                feature_set = self.get_feature_set(
                    name, version, fail_if_missing=True
                )
            # Use a small reference DataFrame to infer fields
            ref_df = table.to_batches(max_chunksize=20)[0].to_pandas()
            feature_set.infer_fields_from_df(
                ref_df, discard_unused_fields=True, replace_existing_features=True
            )
            self.apply(feature_set)

        # Always ingest against the feature set as registered in Feast Core
        feature_set = self.get_feature_set(name, version, fail_if_missing=True)

        if feature_set.source.source_type == "Kafka":
            ingest_table_to_kafka(
                feature_set=feature_set,
                table=table,
                max_workers=max_workers,
                disable_pbar=disable_progress_bar,
                chunk_size=chunk_size,
                timeout=timeout,
            )
        else:
            raise Exception(
                f"Could not determine source type for feature set "
                f'"{feature_set.name}" with source type '
                f'"{feature_set.source.source_type}"'
            )
def _build_feature_set_request(feature_ids: List[str]) -> List[FeatureSetRequest]:
"""
Builds a list of FeatureSet objects from feature set ids in order to
retrieve feature data from Feast Serving
Args:
feature_ids: List of feature ids
("feature_set_name:version:feature_name")
"""
feature_set_request = dict() # type: Dict[str, FeatureSetRequest]
for feature_id in feature_ids:
fid_parts = feature_id.split(":")
if len(fid_parts) == 3:
feature_set, version, feature = fid_parts
else:
raise ValueError(
f"Could not parse feature id ${feature_id}, needs 2 colons"
)
if feature_set not in feature_set_request:
feature_set_request[feature_set] = FeatureSetRequest(
name=feature_set, version=int(version)
)
feature_set_request[feature_set].feature_names.append(feature)
return list(feature_set_request.values())
def _read_table_from_source(source: Union[pd.DataFrame, str]) -> pa.lib.Table:
"""
Infers a data source type (path or Pandas Dataframe) and reads it in as
a PyArrow Table.
Args:
source: Either a string path or Pandas Dataframe
Returns:
PyArrow table
"""
# Pandas dataframe detected
if isinstance(source, pd.DataFrame):
table = pa.Table.from_pandas(df=source)
# Inferring a string path
elif isinstance(source, str):
file_path = source
filename, file_ext = os.path.splitext(file_path)
if ".csv" in file_ext:
from pyarrow import csv
table = csv.read_csv(filename)
elif ".json" in file_ext:
from pyarrow import json
table = json.read_json(filename)
else:
table = pq.read_table(file_path)
else:
raise ValueError(f"Unknown data source provided for ingestion: {source}")
# Ensure that PyArrow table is initialised
assert isinstance(table, pa.lib.Table)
return table