Skip to main content

Quickstart

Get a basic ConversionOS propensity scoring pipeline running in BigQuery in under an hour.

Prerequisites

  • Google Cloud project with BigQuery enabled
  • Dataset with at least 90 days of customer event data
  • A target conversion event defined (purchase, signup, deposit, etc.)

Step 1: Create the Feature Table

-- Step 1: one row per user with 90-day behavioral features and a binary conversion label.
CREATE OR REPLACE TABLE `your_project.conversionos.feature_table` AS
WITH user_events AS (
  -- Aggregate raw events into per-user behavioral features.
  -- FIX: the original also selected event_name/event_date/event_timestamp, which are
  -- neither aggregated nor in GROUP BY — BigQuery rejects that query. Removed them.
  SELECT
    user_id,
    -- Behavioral features
    COUNTIF(event_name = 'page_view') AS total_pageviews,
    COUNTIF(event_name = 'page_view'
      AND event_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)) AS pageviews_7d,
    COUNTIF(event_name = 'page_view'
      AND event_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)) AS pageviews_30d,
    -- Engagement features
    COUNT(DISTINCT event_date) AS active_days,
    DATE_DIFF(CURRENT_DATE(), MAX(event_date), DAY) AS days_since_last_activity,
    -- Session features
    COUNT(DISTINCT session_id) AS total_sessions,
    AVG(session_duration_seconds) AS avg_session_duration  -- trailing comma removed
  FROM `your_project.analytics.events`
  WHERE event_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY)
  GROUP BY user_id
),
conversions AS (
  -- Users who fired the target conversion event in the window (label = 1).
  SELECT
    user_id,
    1 AS converted
  FROM `your_project.analytics.events`
  WHERE event_name = 'purchase'  -- Replace with your conversion event
    AND event_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY)
  GROUP BY user_id
)
SELECT
  e.*,
  -- SAFE_DIVIDE returns NULL instead of erroring when the denominator is 0.
  SAFE_DIVIDE(e.pageviews_7d, e.pageviews_30d) AS engagement_recency_ratio,
  SAFE_DIVIDE(e.total_sessions, e.active_days) AS sessions_per_active_day,
  COALESCE(c.converted, 0) AS label
FROM user_events e
LEFT JOIN conversions c USING (user_id);

Step 2: Train the XGBoost Model

-- Step 2: train a gradient-boosted tree classifier on the engineered features.
CREATE OR REPLACE MODEL `your_project.conversionos.propensity_model`
OPTIONS (
  model_type = 'BOOSTED_TREE_CLASSIFIER',
  input_label_cols = ['label'],
  -- Tree structure
  num_parallel_tree = 5,
  max_tree_depth = 6,
  min_split_loss = 0.1,
  -- Regularization to curb overfitting
  learn_rate = 0.1,
  l1_reg = 1.0,
  l2_reg = 1.0,
  early_stop = TRUE,
  -- Hold out 20% of rows at random for evaluation
  data_split_method = 'RANDOM',
  data_split_eval_fraction = 0.2,
  -- Enables ML.GLOBAL_EXPLAIN feature attributions
  enable_global_explain = TRUE
) AS
SELECT
  -- Feature columns only — identifiers like user_id are deliberately excluded.
  total_pageviews,
  pageviews_7d,
  pageviews_30d,
  active_days,
  days_since_last_activity,
  total_sessions,
  avg_session_duration,
  engagement_recency_ratio,
  sessions_per_active_day,
  label
FROM `your_project.conversionos.feature_table`;

Step 3: Score Prospects

-- Step 3: score every user and bucket them into actionable propensity tiers.
CREATE OR REPLACE TABLE `your_project.conversionos.scored_prospects` AS
SELECT
  user_id,
  predicted_label,
  -- FIX: the probability is a field of the unnested struct; the original referenced
  -- a bare `prob` column that does not exist and fails to compile.
  ROUND(prob_struct.prob, 4) AS propensity_score,
  CASE
    WHEN prob_struct.prob >= 0.7 THEN 'HIGH'
    WHEN prob_struct.prob >= 0.4 THEN 'MEDIUM'
    WHEN prob_struct.prob >= 0.15 THEN 'LOW'
    ELSE 'EXCLUDE'
  END AS propensity_tier,
  -- Value weights handed to ad platforms / CDP for value-based bidding.
  CASE
    WHEN prob_struct.prob >= 0.7 THEN 240
    WHEN prob_struct.prob >= 0.4 THEN 50
    WHEN prob_struct.prob >= 0.15 THEN 10
    ELSE 1
  END AS conversion_value
FROM ML.PREDICT(
  MODEL `your_project.conversionos.propensity_model`,
  (SELECT * FROM `your_project.conversionos.feature_table`)
), UNNEST(predicted_label_probs) AS prob_struct
-- Keep only the positive-class probability for each user.
WHERE prob_struct.label = 1;

Step 4: Evaluate Model Performance

-- Step 4: evaluate on the automatically held-out 20% split
-- (data_split_eval_fraction set at training time).
-- FIX: the original passed the full feature table, which scores the model on its
-- own training rows and inflates every metric. Omitting the input table makes
-- ML.EVALUATE use the reserved evaluation split instead.
SELECT * FROM ML.EVALUATE(
  MODEL `your_project.conversionos.propensity_model`
);

Key metrics to monitor:

  • AUC-ROC: Target > 0.80 for production deployment
  • Log loss: Lower is better; compare against a naive baseline that always predicts the overall conversion rate
  • Precision/Recall at threshold: Tune based on business tolerance

Step 5: Export to CDP

Push scored prospects to your CDP via:

  • BigQuery → ActionIQ: Native connector, schedule daily sync
  • BigQuery → Segment: Use Reverse ETL (Census, Hightouch)
  • BigQuery → Custom: Export via scheduled query to GCS, pick up via API
# Example: Export scores to GCS for CDP ingestion.
# Runs a BigQuery extract job that writes scored_prospects as sharded CSV files.
from google.cloud import bigquery

client = bigquery.Client()
# Wildcard URI lets BigQuery shard large tables across multiple files.
destination_uri = "gs://your-bucket/conversionos/scores/daily/*.csv"

dataset_ref = bigquery.DatasetReference("your_project", "conversionos")
table_ref = dataset_ref.table("scored_prospects")

# FIX: be explicit about the output format and header row instead of relying on
# implicit defaults — CDP ingestion pipelines typically require a header.
job_config = bigquery.job.ExtractJobConfig(
    destination_format=bigquery.DestinationFormat.CSV,
    print_header=True,
)

extract_job = client.extract_table(
    table_ref,
    destination_uri,
    location="US",  # must match the dataset's location
    job_config=job_config,
)
# Blocks until the job completes; raises google.api_core exceptions on failure.
extract_job.result()
print(f"Exported {table_ref.dataset_id}.{table_ref.table_id} to {destination_uri}")

Next Steps