import tempfile
import time
from datetime import datetime, timedelta
from typing import List
from urllib.parse import urlparse
import fastavro
import pandas as pd
from fastavro import reader as fastavro_reader
from google.cloud import storage
from feast.serving.ServingService_pb2 import GetJobRequest
from feast.serving.ServingService_pb2 import (
Job as JobProto,
JOB_STATUS_DONE,
DATA_FORMAT_AVRO,
)
from feast.serving.ServingService_pb2_grpc import ServingServiceStub
# Maximum number of seconds to wait until a job's status is DONE in Feast.
# Currently set to the maximum query execution time limit in BigQuery.
DEFAULT_TIMEOUT_SEC: int = 21600
# Maximum number of seconds to wait between polls of the job status in Feast
# (upper bound for the exponential backoff in Job.result).
MAX_WAIT_INTERVAL_SEC: int = 60
class Job:
    """
    A class representing a job for feature retrieval in Feast.

    Wraps a serving-service ``Job`` proto and polls the serving stub until
    the job is done, then streams the Avro result files (from GCS or the
    local filesystem) back to the caller.
    """

    def __init__(self, job_proto: JobProto, serving_stub: ServingServiceStub):
        """
        Args:
            job_proto: Job proto object (wrapped by this job object)
            serving_stub: Stub for Feast serving service
        """
        self.job_proto = job_proto
        self.serving_stub = serving_stub
        # Client for downloading gs:// result files; project=None falls back
        # to the ambient (default-credential) project.
        self.storage_client = storage.Client(project=None)

    @property
    def id(self):
        """
        Getter for the Job Id
        """
        return self.job_proto.id

    @property
    def status(self):
        """
        Getter for the Job status from Feast Core
        """
        return self.job_proto.status

    def reload(self):
        """
        Reload the latest job status

        Returns: None
        """
        self.job_proto = self.serving_stub.GetJob(GetJobRequest(job=self.job_proto)).job

    def result(self, timeout_sec: int = DEFAULT_TIMEOUT_SEC):
        """
        Wait until job is done to get an iterable rows of result.
        The row can only represent an Avro row in Feast 0.3.

        Args:
            timeout_sec: max no of seconds to wait until job is done. If
                "timeout_sec" is exceeded, an exception will be raised.

        Returns: Iterable of Avro rows

        Raises:
            Exception: if the timeout is exceeded, the job reports an error,
                the data format is not Avro, or a file URI scheme is not
                gs:// or file://.
        """
        max_wait_datetime = datetime.now() + timedelta(seconds=timeout_sec)
        wait_duration_sec = 2
        while self.status != JOB_STATUS_DONE:
            if datetime.now() > max_wait_datetime:
                raise Exception(
                    "Timeout exceeded while waiting for result. Please retry this method or use a longer timeout value."
                )
            self.reload()
            time.sleep(wait_duration_sec)
            # Backoff the wait duration exponentially up till MAX_WAIT_INTERVAL_SEC
            wait_duration_sec = min(wait_duration_sec * 2, MAX_WAIT_INTERVAL_SEC)
        if self.job_proto.error:
            raise Exception(self.job_proto.error)
        if self.job_proto.data_format != DATA_FORMAT_AVRO:
            raise Exception(
                "Feast only supports Avro data format for now. Please check "
                "your Feast Serving deployment."
            )
        uris = [urlparse(uri) for uri in self.job_proto.file_uris]
        for file_uri in uris:
            if file_uri.scheme == "gs":
                # Spool the remote object to a local temp file before decoding.
                file_obj = tempfile.TemporaryFile()
                self.storage_client.download_blob_to_file(file_uri.geturl(), file_obj)
            elif file_uri.scheme == "file":
                file_obj = open(file_uri.path, "rb")
            else:
                raise Exception(
                    f"Could not identify file URI {file_uri}. Only gs:// and file:// supported"
                )
            # Ensure the handle is closed even if the consumer stops iterating
            # early or fastavro raises (the original code leaked it).
            with file_obj:
                file_obj.seek(0)
                avro_reader = fastavro.reader(file_obj)
                yield from avro_reader

    def to_dataframe(self, timeout_sec: int = DEFAULT_TIMEOUT_SEC):
        """
        Wait until job is done to get an iterable rows of result

        Args:
            timeout_sec: max no of seconds to wait until job is done. If
                "timeout_sec" is exceeded, an exception will be raised.

        Returns: pandas Dataframe of the feature values
        """
        records = list(self.result(timeout_sec=timeout_sec))
        return pd.DataFrame.from_records(records)

    def __iter__(self):
        # Iterating a Job streams its result rows with the default timeout.
        return iter(self.result())