Version: 1.1

Stream Feature View Examples

Row-Level SQL Transformation

from tecton import stream_feature_view, Attribute
from tecton.types import Float64
from fraud.entities import user
from fraud.data_sources.transactions import transactions_stream
from datetime import datetime, timedelta


@stream_feature_view(
    source=transactions_stream,
    entities=[user],
    mode="spark_sql",
    online=True,
    offline=True,
    feature_start_time=datetime(2022, 5, 1),
    batch_schedule=timedelta(days=1),
    ttl=timedelta(days=30),
    description="Last user transaction amount (stream calculated)",
    timestamp_field="timestamp",
    features=[Attribute("amt", Float64)],
)
def last_transaction_amount_sql(transactions):
    return f"""
        SELECT
            timestamp,
            user_id,
            amt
        FROM
            {transactions}
        """

Row-Level PySpark Transformation

from tecton import stream_feature_view, Attribute
from fraud.entities import user
from fraud.data_sources.transactions import transactions_stream
from datetime import datetime, timedelta


@stream_feature_view(
    source=transactions_stream,
    entities=[user],
    mode="pyspark",
    online=True,
    offline=True,
    feature_start_time=datetime(2022, 5, 1),
    batch_schedule=timedelta(days=1),
    ttl=timedelta(days=30),
    description="Last user transaction amount (stream calculated)",
    timestamp_field="timestamp",
    features=[Attribute("amt", Float64)],
)
def last_transaction_amount_pyspark(transactions):
    from pyspark.sql import functions as f

    return transactions.select("timestamp", "user_id", "amt")

Time-Windowed Aggregations

from tecton import stream_feature_view, Aggregate
from fraud.entities import user
from fraud.data_sources.transactions import transactions_stream
from datetime import datetime, timedelta


# The following defines several sliding time window aggregations over a user's transaction amounts
@stream_feature_view(
    source=transactions_stream,
    entities=[user],
    mode="spark_sql",
    aggregation_interval=timedelta(minutes=10),  # Defines how frequently feature values get updated in the online store
    batch_schedule=timedelta(
        days=1
    ),  # Defines how frequently batch jobs are scheduled to ingest into the offline store
    timestamp_field="timestamp",
    features=[
        Aggregate(input_column=Field("amt", Float64), function="sum", time_window=timedelta(hours=1)),
        Aggregate(input_column=Field("amt", Float64), function="sum", time_window=timedelta(days=1)),
        Aggregate(input_column=Field("amt", Float64), function="sum", time_window=timedelta(days=3)),
        Aggregate(input_column=Field("amt", Float64), function="mean", time_window=timedelta(hours=1)),
        Aggregate(input_column=Field("amt", Float64), function="mean", time_window=timedelta(days=1)),
        Aggregate(input_column=Field("amt", Float64), function="mean", time_window=timedelta(days=3)),
    ],
    online=True,
    offline=True,
    feature_start_time=datetime(2022, 5, 1),
    description="Transaction amount statistics and total over a series of time windows, updated every 10 minutes.",
)
def user_transaction_amount_metrics(transactions):
    return f"""
        SELECT
            user_id,
            amt,
            timestamp
        FROM
            {transactions}
        """

Time-Windowed Aggregations with `StreamProcessingMode.CONTINUOUS`

Please see the Stream Processing Mode documentation for details on how StreamProcessingMode works.

Feature Definition Example

Set stream_processing_mode=StreamProcessingMode.CONTINUOUS to enable continuous event processing.
Optionally set instance_availability="on_demand" within the stream_cluster_config. Spot Instances may lead to feature processing delays due to spot termination or looking for an available instance, so On-demand Instances will deliver more consistent performance.

This example Feature View shows how to configure the decorator parameters.

from tecton import stream_feature_view, Aggregate, StreamProcessingMode
from tecton.types import Int32
from fraud.entities import user
from fraud.data_sources.transactions import transactions_stream
from datetime import datetime, timedelta

# The following defines a continuous streaming feature
# It counts the number of non-fraudulent transactions per user over a 1min, 5min and 1h time window
# The expected freshness for these features is ~5 seconds
@stream_feature_view(
    source=transactions_stream,
    entities=[user],
    mode="spark_sql",
    stream_processing_mode=StreamProcessingMode.CONTINUOUS,
    timestamp_field="timestamp",
    features=[
        Aggregate(input_column=Field("transaction", Int32), function="count", time_window=timedelta(minutes=1)),
        Aggregate(input_column=Field("transaction", Int32), function="count", time_window=timedelta(minutes=30)),
        Aggregate(input_column=Field("transaction", Int32), function="count", time_window=timedelta(hours=1)),
    ],
    online=True,
    offline=True,
    feature_start_time=datetime(2022, 5, 1),
    description="Number of transactions a user has made recently",
)
def user_continuous_transaction_count(transactions):
    return f"""
        SELECT
            user_id,
            1 as transaction,
            timestamp
        FROM
            {transactions}
        """

Stream Data Source Configuration

If your stream data source is Kinesis, we suggest lowering the default buffering to avoid delays in event processing.

Here are some suggested parameter values for a KinesisConfig:

For Databricks users:
- maxFetchDuration="200ms"
- maxFetchRate="2"
- minFetchPeriod="200ms"
For EMR users:
- kinesis.executor.idleTimeBetweenReadsInMs ="200"
- kinesis.executor.maxFetchTimeInMs = "200"

This example data source shows how to configure the stream options on a KinesisConfig with Databricks.

from tecton import (
    HiveConfig,
    KinesisConfig,
    StreamSource,
    BatchSource,
    DatetimePartitionColumn,
)
from datetime import timedelta


def raw_data_deserialization(df):
    from pyspark.sql.functions import col, from_json, from_utc_timestamp, when
    from pyspark.sql.types import (
        StructType,
        StructField,
        StringType,
        DoubleType,
        TimestampType,
        BooleanType,
        IntegerType,
    )

    payload_schema = StructType(
        [
            StructField("user_id", StringType(), False),
            StructField("transaction_id", StringType(), False),
            StructField("category", StringType(), False),
            StructField("amt", StringType(), False),
            StructField("is_fraud", StringType(), False),
            StructField("merchant", StringType(), False),
            StructField("merch_lat", StringType(), False),
            StructField("merch_long", StringType(), False),
            StructField("timestamp", StringType(), False),
        ]
    )

    return (
        df.selectExpr("cast (data as STRING) jsonData")
        .select(from_json("jsonData", payload_schema).alias("payload"))
        .select(
            col("payload.user_id").alias("user_id"),
            col("payload.transaction_id").alias("transaction_id"),
            col("payload.category").alias("category"),
            col("payload.amt").cast("double").alias("amt"),
            col("payload.is_fraud").cast("long").alias("is_fraud"),
            col("payload.merchant").alias("merchant"),
            col("payload.merch_lat").cast("double").alias("merch_lat"),
            col("payload.merch_long").cast("double").alias("merch_long"),
            from_utc_timestamp("payload.timestamp", "UTC").alias("timestamp"),
        )
    )


partition_columns = [
    DatetimePartitionColumn(column_name="partition_0", datepart="year", zero_padded=True),
    DatetimePartitionColumn(column_name="partition_1", datepart="month", zero_padded=True),
    DatetimePartitionColumn(column_name="partition_2", datepart="day", zero_padded=True),
]

batch_config = HiveConfig(
    database="demo_fraud_v2",
    table="transactions",
    timestamp_field="timestamp",
    datetime_partition_columns=partition_columns,
)


transactions_stream = StreamSource(
    name="transactions_stream",
    stream_config=KinesisConfig(
        stream_name="tecton-demo-fraud-data-stream",
        region="us-west-2",
        initial_stream_position="latest",
        watermark_delay_threshold=timedelta(hours=24),
        timestamp_field="timestamp",
        post_processor=raw_data_deserialization,
        options={"roleArn": "arn:aws:iam::706752053316:role/tecton-demo-fraud-data-cross-account-kinesis-ro"},
    ),
    batch_config=batch_config,
)

transactions_batch = BatchSource(
    name="transactions_batch",
    batch_config=batch_config,
)

Row-Level SQL Transformation​

Row-Level PySpark Transformation​

Time-Windowed Aggregations​

Time-Windowed Aggregations with StreamProcessingMode.CONTINUOUS​

Feature Definition Example​

Stream Data Source Configuration​

Was this page helpful?