api / ui /tab_data.py
safraeli's picture
Deploy: 2026 sensor migration + redesign + bucket B endpoints
13fc29d verified
"""
Data tab.
"""
from __future__ import annotations

import numpy as np
import pandas as pd
import streamlit as st

from config import settings
from ui.bootstrap import _BRAND_GREEN, _HAS_PLOTLY, load_labels, load_metrics

if _HAS_PLOTLY:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
def render_tab_data() -> None:
    """Render the Data tab.

    A horizontal radio selects one of three sections; each section is drawn
    by its own private helper defined below:

    * "Farquhar Model"   -> explanation of the model plus the Stage 1 labels
    * "Model Validation" -> comparison against Greer & Weedon (2012)
    * "Data Explorer"    -> vineyard sensors, weather data, AI data engineering
    """
    _data_section = st.radio(
        "Section",
        ["Farquhar Model", "Model Validation", "Data Explorer"],
        horizontal=True,
    )
    if _data_section == "Farquhar Model":
        _render_farquhar_section()
    elif _data_section == "Model Validation":
        _render_model_validation()
    elif _data_section == "Data Explorer":
        _render_data_explorer()


# ---------------------------------------------------------------------------
# Section: Farquhar Model
# ---------------------------------------------------------------------------
def _render_farquhar_section() -> None:
    """Explain the Farquhar model and show the pre-computed Stage 1 labels."""
    st.header("How we measure vine photosynthesis")
    st.markdown(
        "Before we can predict photosynthesis, we need to **measure** it. "
        "On-site sensors record light, temperature, humidity, and CO2 every "
        "15 minutes. A well-established plant biology model (Farquhar et al., 1980) "
        "converts these readings into the photosynthesis rate **A** — "
        "how fast the vine is converting sunlight into sugar. "
        "This tab shows the results of that calculation."
    )
    with st.expander("How does the Farquhar model work?"):
        st.markdown(
            "Uses the **Farquhar et al. (1980)** mechanistic model with "
            "**Greer & Weedon (2012)** grapevine parameters to compute the net leaf "
            "photosynthesis rate **A** (\u00b5mol CO\u2082 m\u207b\u00b2 s\u207b\u00b9) from on-site sensor readings "
            "(PAR, leaf temperature, air temperature, CO\u2082, VPD). Only daytime rows "
            "(PAR > 50) during the growing season (May\u2013Sep) are used. "
            "Temperature dependencies use **Bernacchi et al. (2001)** kinetic constants.\n\n"
            "The core computation:\n\n"
            "1. **Rubisco-limited rate:** Ac = Vcmax \u00b7 (ci \u2212 \u0393*) / (ci + Kc \u00b7 (1 + O\u1d62 / Ko))\n"
            "2. **RuBP-limited rate:** Aj = J \u00b7 (ci \u2212 \u0393*) / (4\u00b7ci + 8\u00b7\u0393*)\n"
            "3. **Net assimilation:** A = min(Ac, Aj) \u2212 Rd\n\n"
            "Where Vcmax and Jmax follow temperature-dependent Arrhenius curves "
            "(peak at 39\u00b0C and 36\u00b0C respectively for Semillon), J is solved from the "
            "light-response quadratic, and ci (intercellular CO\u2082) is derived from "
            "ambient CO\u2082 scaled by stomatal conductance (reduced by VPD and CWSI).\n\n"
            "#### How VPD and CWSI reduce photosynthesis\n\n"
            "Both VPD and CWSI act on photosynthesis through the same bottleneck: "
            "**stomatal conductance (gs)**. Stomata are the pores on the leaf surface "
            "that let CO\u2082 in for photosynthesis \u2014 but also let water vapor out.\n\n"
            "**VPD (Vapor Pressure Deficit)** measures how dry the air is. "
            "When VPD is high (hot, dry air), the leaf would lose water too fast, "
            "so the vine partially closes its stomata to conserve water. "
            "Less open stomata = less CO\u2082 enters the leaf = lower ci = lower A. "
            "The model applies an exponential decay: "
            "`gs_scale = exp(\u22120.3 \u00b7 max(0, VPD \u2212 1.0))`, "
            "so the effect kicks in above 1 kPa and intensifies with drier air.\n\n"
            "**CWSI (Crop Water Stress Index)** is computed from the leaf\u2013air "
            "temperature difference: `CWSI = (Tleaf \u2212 Tair \u2212 \u0394Tmin) / (\u0394Tmax \u2212 \u0394Tmin)`, "
            "clipped to [0, 1]. A well-watered vine transpires freely, keeping its "
            "leaves cooler than the air (CWSI \u2248 0). A stressed vine closes stomata, "
            "transpiration drops, and leaves heat up (CWSI \u2192 1). "
            "The model reduces stomatal conductance by `(1 \u2212 0.5 \u00b7 CWSI)`, "
            "so at full stress (CWSI = 1) stomatal opening is halved.\n\n"
            "Combined effect on ci: "
            "`ci = CO\u2082 \u00b7 (1 \u2212 1 / (1.6 \u00b7 gs_factor))` where "
            "`gs_factor = 1.2 \u00b7 VPD_scale \u00b7 (1 \u2212 0.5 \u00b7 CWSI)`\n\n"
            "This means on a hot, dry Negev afternoon (VPD > 3 kPa, CWSI > 0.5), "
            "ci drops sharply and photosynthesis can fall even when light is abundant "
            "\u2014 the vine has plenty of energy but cannot get enough CO\u2082 through "
            "its closed stomata.\n\n"
            "**Key references:**\n"
            "- [Farquhar, von Caemmerer & Berry (1980)](https://doi.org/10.1007/BF00386231) \u2014 "
            "Biochemical model of photosynthetic CO\u2082 assimilation\n"
            "- [Greer & Weedon (2012)](https://doi.org/10.1111/j.1365-3040.2011.02471.x) \u2014 "
            "Modelling photosynthetic responses to temperature of grapevine\n"
            "- [Bernacchi et al. (2001)](https://doi.org/10.1046/j.1365-3040.2001.00668.x) \u2014 "
            "Temperature dependence of Kc, Ko, and \u0393*"
        )

    st.subheader("Sensor inputs")
    st.markdown(
        "The Farquhar model uses **6 columns** from the Air1 reference station, "
        "sampled every 15 minutes:"
    )
    used_sensors_tab = pd.DataFrame([
        {
            "Column": "Air1_PAR_ref",
            "Measurement": "Photosynthetically Active Radiation",
            "Units": "\u00b5mol m\u207b\u00b2 s\u207b\u00b9",
            "Used in": "Farquhar: electron transport (J)",
            "Notes": "Primary light input driving photosynthesis rate",
        },
        {
            "Column": "Air1_leafTemperature_ref",
            "Measurement": "Leaf temperature",
            "Units": "\u00b0C",
            "Used in": "Farquhar: Vcmax, Jmax, Kc, Ko, \u0393*; CWSI",
            "Notes": "Controls enzyme kinetics; also used to compute CWSI",
        },
        {
            "Column": "Air1_airTemperature_ref",
            "Measurement": "Air temperature",
            "Units": "\u00b0C",
            "Used in": "CWSI computation",
            "Notes": "Tleaf \u2212 Tair drives the water stress index",
        },
        {
            "Column": "Air1_CO2_ref",
            "Measurement": "Ambient CO\u2082 concentration",
            "Units": "ppm",
            "Used in": "Farquhar: intercellular CO\u2082 (ci)",
            "Notes": "Substrate for carbon fixation by Rubisco (\u00d70.7 correction applied)",
        },
        {
            "Column": "Air1_VPD_ref",
            "Measurement": "Vapor Pressure Deficit",
            "Units": "kPa",
            "Used in": "Farquhar: stomatal conductance \u2192 ci",
            "Notes": "High VPD closes stomata, reducing ci and thus A",
        },
        {
            "Column": "Air1_airHumidity_ref",
            "Measurement": "Relative humidity",
            "Units": "%",
            "Used in": "Loaded but not consumed",
            "Notes": "VPD already encodes humidity; column is redundant",
        },
    ])
    st.dataframe(used_sensors_tab, hide_index=True)

    labels_path = settings.PROCESSED_DIR / "stage1_labels.csv"
    validation_img = settings.OUTPUTS_DIR / "stage1_validation.png"
    if not labels_path.exists():
        st.info("No pre-computed photosynthesis labels found. Run `python scripts/run_pipeline.py` to generate them.")
        return

    df_labels = load_labels(str(labels_path))
    df_labels.index = pd.to_datetime(df_labels.index, utc=True)

    # Metrics row
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Observations", f"{len(df_labels):,}")
    c2.metric(
        "Avg photosynthesis rate",
        f"{df_labels.iloc[:, 0].mean():.1f}",
        help="Higher values mean the vine is growing faster. Typical range: 5-20.",
    )
    c3.metric("Date from", df_labels.index.min().strftime("%Y-%m-%d"))
    c4.metric("Date to", df_labels.index.max().strftime("%Y-%m-%d"))

    with st.expander("What does this table show?"):
        st.markdown(
            "Descriptive statistics (count, mean, std, min, quartiles, max) of the "
            "computed photosynthesis rate **A**. Typical grapevine values are 0\u201325 "
            "\u00b5mol CO\u2082 m\u207b\u00b2 s\u207b\u00b9. Values outside this range may indicate sensor issues."
        )
    st.dataframe(df_labels.describe())
    st.caption(
        "This table summarizes the photosynthesis measurements. The 'mean' row shows "
        "the average rate across all observations. Values between 5-20 are typical "
        "for healthy grapevines during the growing season."
    )
    st.download_button(
        "Download labels CSV",
        df_labels.to_csv(),
        file_name="stage1_labels.csv",
        mime="text/csv",
    )

    if validation_img.exists():
        with st.expander("How to read the validation plots"):
            st.markdown(
                "**Top panel \u2014 Diurnal pattern:** A vs. hour of day (UTC). Expect a bell curve "
                "peaking mid-morning to early afternoon when light and temperature are optimal.\n\n"
                "**Bottom panel \u2014 A vs PAR:** Photosynthesis rate plotted against Photosynthetically "
                "Active Radiation. A should increase with PAR and saturate at high light levels, "
                "forming a characteristic light-response curve."
            )
        st.image(str(validation_img), width='stretch')


# ---------------------------------------------------------------------------
# Section: Model Validation
# ---------------------------------------------------------------------------
# --- Reference data from Greer & Weedon (2012) ---
_VAL_TEMPS = [20, 25, 30, 35, 40]
# Fig 5a: Light-saturated Amax at ambient CO2 (389 ppm)
_PAPER_AMAX = {20: 12.0, 25: 16.9, 30: 19.9, 35: 15.3, 40: 12.0}
_PAPER_AMAX_SE = {20: 1.5, 25: 1.3, 30: 1.8, 35: 1.2, 40: 1.5}
# Fig 11: Vcmax and Jmax (from Arrhenius fit to A/Ci curves)
_PAPER_VCMAX = {20: 20, 25: 38.5, 30: 58, 35: 85, 40: 110}
_PAPER_JMAX = {20: 60, 25: 98.3, 30: 135, 35: 165, 40: 170}
# Table 1: Stomatal limitation (%). Kept for reference; not displayed yet.
_PAPER_STOM_LIM = {20: 13, 25: 20, 30: 25, 35: 31, 40: 34}
# One colour per validation temperature, shared by model curve and paper marker.
_TEMP_COLORS = {20: "#1f77b4", 25: "#2ca02c", 30: "#d62728",
                35: "#9467bd", 40: "#ff7f0e"}


def _render_model_validation() -> None:
    """Compare the FvCB model with the Greer & Weedon (2012) reference data."""
    st.header("FvCB Model Validation")
    st.markdown(
        "Comparison of our Farquhar-von Caemmerer-Berry (FvCB) photosynthesis model "
        "against measured data from "
        "[Greer & Weedon (2012)](https://doi.org/10.1111/j.1365-3040.2011.02471.x) "
        "for field-grown *Vitis vinifera* cv. **Semillon** in a hot climate "
        "(Riverina, NSW, Australia)."
    )

    # Imported lazily so the other sections work without the model package.
    from src.farquhar_model import FarquharModel as _FMVal
    _val_model = _FMVal()

    # ---- Section 1: Light Response Curves ----
    st.subheader("1. Photosynthetic light response at different temperatures")
    st.markdown(
        "Light response curves (A vs PFD) at five leaf temperatures. "
        "Model run at ambient CO$_2$ = 389 ppm, VPD = 1.5 kPa."
    )
    _val_vpd = st.slider(
        "VPD for model curves (kPa)", 0.5, 3.0, 1.5, 0.1,
        key="val_vpd"
    )
    _render_light_response(_val_model, _val_vpd)
    _render_amax_table(_val_model, _val_vpd)
    _render_vcmax_jmax(_val_model)
    _render_limitation_regime(_val_model)
    _render_validation_findings()


def _render_light_response(model, vpd: float) -> None:
    """Plot modelled A vs PFD per temperature with the paper's Amax markers."""
    if not _HAS_PLOTLY:
        st.info("Install plotly for interactive charts.")
        return

    pfds = np.arange(0, 2100, 25)
    fig = go.Figure()
    for t in _VAL_TEMPS:
        # Model curve
        a_vals = [
            model.calc_photosynthesis(PAR=float(p), Tleaf=t, CO2=389, VPD=vpd, Tair=t)
            for p in pfds
        ]
        fig.add_trace(go.Scatter(
            x=pfds, y=a_vals, mode="lines",
            name=f"{t} °C (model)",
            line=dict(color=_TEMP_COLORS[t]),
        ))
        # Paper reference point (Amax)
        fig.add_trace(go.Scatter(
            x=[1800], y=[_PAPER_AMAX[t]],
            mode="markers",
            name=f"{t} °C (Greer & Weedon)",
            marker=dict(color=_TEMP_COLORS[t], size=12, symbol="star",
                        line=dict(width=1, color="black")),
            error_y=dict(type="data", array=[_PAPER_AMAX_SE[t]], visible=True),
            showlegend=True,
        ))
    fig.update_layout(
        xaxis_title="PFD [µmol photons m⁻² s⁻¹]",
        yaxis_title="A [µmol CO₂ m⁻² s⁻¹]",
        height=500,
        legend=dict(font=dict(size=10)),
    )
    st.plotly_chart(fig)


def _render_amax_table(model, vpd: float) -> None:
    """Tabulate light-saturated A (PAR = 2000) from the model next to the paper."""
    st.subheader("2. Light-saturated A: Model vs Paper")
    rows = []
    for t in _VAL_TEMPS:
        a_model = model.calc_photosynthesis(
            PAR=2000, Tleaf=t, CO2=389, VPD=vpd, Tair=t
        )
        a_paper = _PAPER_AMAX[t]
        rows.append({
            "T_leaf (°C)": t,
            "A_model": round(a_model, 1),
            "A_paper (Greer & Weedon)": a_paper,
            "Difference (%)": round((a_model - a_paper) / a_paper * 100, 0),
            "Limitation": "RuBP regen." if t <= 30 else "Rubisco",
        })
    st.dataframe(pd.DataFrame(rows), hide_index=True)


def _render_vcmax_jmax(model) -> None:
    """Plot the model's Vcmax/Jmax temperature curves against scaled paper values."""
    st.subheader("3. Vcmax and Jmax temperature response")
    st.markdown(
        "Temperature dependence of maximum carboxylation rate (Vcmax) and "
        "electron transport capacity (Jmax). Model uses modified Arrhenius "
        "(Medlyn et al. 2002) with Greer & Weedon (2012) activation/deactivation "
        "energies. Topt(Vcmax) = 39 °C, Topt(Jmax) = 36 °C."
    )
    if not _HAS_PLOTLY:
        st.info("Install plotly for interactive charts.")
        return

    t_range = np.arange(15, 50, 0.5)
    vcmax_curve = [model.calc_Vcmax(t + 273.15) for t in t_range]
    jmax_curve = [model.calc_Jmax(t + 273.15) for t in t_range]

    # Scale paper's Cc-based Vcmax/Jmax to our Ci-based values for comparison
    scale_v = model.params["k25_vcmax"] / 38.5  # ratio Ci-based/Cc-based
    scale_j = model.params["k25_jmax"] / 98.3
    paper_vcmax_scaled = {t: v * scale_v for t, v in _PAPER_VCMAX.items()}
    paper_jmax_scaled = {t: j * scale_j for t, j in _PAPER_JMAX.items()}

    fig = make_subplots(rows=1, cols=2, subplot_titles=("Vcmax", "Jmax"))
    panels = (
        ("Vcmax", vcmax_curve, paper_vcmax_scaled, "#d62728", 1),
        ("Jmax", jmax_curve, paper_jmax_scaled, "#1f77b4", 2),
    )
    for label, curve, paper_scaled, color, col in panels:
        fig.add_trace(go.Scatter(
            x=list(t_range), y=curve, mode="lines",
            name=f"{label} (model)", line=dict(color=color),
        ), row=1, col=col)
        fig.add_trace(go.Scatter(
            x=list(paper_scaled.keys()),
            y=list(paper_scaled.values()),
            mode="markers", name=f"{label} (paper, scaled)",
            marker=dict(color=color, size=10, symbol="star",
                        line=dict(width=1, color="black")),
        ), row=1, col=col)
    fig.update_xaxes(title_text="Leaf temperature (°C)")
    fig.update_yaxes(title_text="µmol m⁻² s⁻¹")
    fig.update_layout(height=400)
    st.plotly_chart(fig)


def _render_limitation_regime(model) -> None:
    """Plot the Rubisco- and RuBP-limited rates and their minimum vs temperature."""
    st.subheader("4. RuBP regeneration vs Rubisco carboxylation limitation")
    st.markdown(
        "The paper's key finding: **below 30 °C**, photosynthesis is limited by "
        "RuBP regeneration (electron transport / light reactions). "
        "**Above 30 °C**, Rubisco carboxylation becomes limiting due to "
        "declining CO$_2$ affinity and increased photorespiration.\n\n"
        "This 30 °C transition is critical for shading decisions:\n"
        "- **Below 30 °C**: shading reduces light and hurts photosynthesis (RuBP-limited)\n"
        "- **Above 30 °C**: shading may help by reducing heat stress on Rubisco"
    )
    if not _HAS_PLOTLY:
        return

    # Called with constant arguments only, so it is evaluated once instead of
    # once per temperature step.
    ci = model._ci_from_ca(389, 1.5, 0.0)

    ac_vals = []
    aj_vals = []
    t_lim = np.arange(15, 46, 0.5)
    for t in t_lim:
        Tk = t + 273.15
        Vcmax = model.calc_Vcmax(Tk)
        Jmax = model.calc_Jmax(Tk)
        J = model.calc_electron_transport(2000, Jmax)
        gamma = model.calc_gamma_star(Tk)
        Kc = model.calc_Kc(Tk)
        Ko = model.calc_Ko(Tk)
        Ac = Vcmax * (ci - gamma) / (ci + Kc * (1 + 210.0 / Ko))
        Aj = J * (ci - gamma) / (4 * ci + 8 * gamma)
        Rd = 0.015 * Vcmax
        ac_vals.append(Ac - Rd)
        aj_vals.append(Aj - Rd)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=list(t_lim), y=ac_vals, mode="lines",
        name="Ac (Rubisco-limited)",
        line=dict(color="#d62728", dash="dash"),
    ))
    fig.add_trace(go.Scatter(
        x=list(t_lim), y=aj_vals, mode="lines",
        name="Aj (RuBP-limited)",
        line=dict(color="#1f77b4", dash="dash"),
    ))
    # Actual A = min(Ac, Aj), floored at zero
    a_net = [max(0, min(ac, aj)) for ac, aj in zip(ac_vals, aj_vals)]
    fig.add_trace(go.Scatter(
        x=list(t_lim), y=a_net, mode="lines",
        name="A_net = min(Ac, Aj)",
        line=dict(color="black", width=3),
    ))
    fig.add_vline(x=30, line_dash="dot", line_color="gray",
                  annotation_text="30 °C transition")
    fig.update_layout(
        xaxis_title="Leaf temperature (°C)",
        yaxis_title="A [µmol CO₂ m⁻² s⁻¹]",
        height=450,
    )
    st.plotly_chart(fig)


def _render_validation_findings() -> None:
    """Show the written summary of the validation results."""
    st.subheader("5. Key findings from validation")
    # Blank lines separate the bold headings from the preceding lists so that
    # Markdown does not fold a heading into the last bullet.
    st.markdown(
        "**Agreement with Greer & Weedon (2012):**\n"
        "- Temperature ranking of Amax matches: 30 °C > 25 °C > 35 °C > 20 °C > 40 °C\n"
        "- RuBP/Rubisco limitation transition occurs at ~30-32 °C (paper: 30 °C)\n"
        "- Vcmax peaks at 39 °C, Jmax peaks at 36 °C (exact match with paper)\n"
        "- Quantitative match within 2-15% at 20-35 °C\n"
        "- Jmax/Vcmax ratio declines from ~2.4 at 20 °C to ~1.1 at 45 °C (paper: 3.0 to 1.5)\n"
        "\n"
        "**Known limitations:**\n"
        "- At 40 °C, model underestimates A by ~12% due to Bernacchi (2001) Rubisco kinetics\n"
        "  being parameterised for tobacco, not heat-adapted grapevine\n"
        "- Stomatal response to temperature is modelled via VPD only; the paper shows\n"
        "  direct temperature effects on gs (Table 1: 0.199 at 20 °C to 0.140 at 40 °C)\n"
        "- The model uses Ci-based (intercellular CO2) calculations; the paper uses\n"
        "  Cc-based (chloroplast CO2) with mesophyll conductance gm = 5-10 µmol m⁻² s⁻¹ Pa⁻¹\n"
        "\n"
        "**Reference:**\n"
        "Greer, D.H. & Weedon, M.M. (2012) Modelling photosynthetic responses to temperature\n"
        "of grapevine (*Vitis vinifera* cv. Semillon) leaves on vines grown in a hot climate.\n"
        "*Plant, Cell & Environment*, 35, 1050-1064.\n"
        "[DOI: 10.1111/j.1365-3040.2011.02471.x](https://doi.org/10.1111/j.1365-3040.2011.02471.x)\n"
    )


# ---------------------------------------------------------------------------
# Section: Data Explorer
# ---------------------------------------------------------------------------
def _render_data_explorer() -> None:
    """Let the user pick a data source and render the matching explorer view."""
    st.header("Data Explorer")
    st.markdown(
        "This tab lets you explore the **raw data** behind the predictions. "
        "Choose a data source below:\n\n"
        "- **Vineyard sensors** \u2014 Photosynthesis rate **A**, PAR, leaf/air temperature, and \u0394T from on-site crop sensors.\n"
        "- **Weather station data** \u2014 IMS station 43 (Sde Boker) and the merged dataset used for ML training.\n"
        "- **AI Data Engineering** \u2014 **Gemini-powered** sensor anomaly detection (Z-score/IQR + physical bounds) and engineered features "
        "(cyclical time encodings, Stress Risk Score). Run the pipeline and inspect thresholds, cleaning summary, and the daytime stress profile."
    )
    eda_stage = st.radio(
        "Data source",
        ["Vineyard sensors", "Weather station data", "AI Data Engineering"],
        horizontal=True,
        label_visibility="visible",
    )
    if eda_stage == "Vineyard sensors":
        _render_vineyard_sensors()
    elif eda_stage == "Weather station data":
        _render_weather_station()
    else:
        _render_ai_data_engineering()


def _render_vineyard_sensors() -> None:
    """Show Stage 1 label statistics, A distributions and raw sensor inputs."""
    st.subheader("Vineyard sensor data")
    with st.expander("About this data"):
        st.markdown(
            "Shows the distribution and temporal patterns of the computed photosynthesis "
            "rate **A**, plus the raw sensor inputs used to calculate it. "
            "This helps verify that the model produces physiologically plausible values."
        )
    try:
        from scripts.eda import get_stage1_eda
        s1 = get_stage1_eda()
    except Exception as e:  # UI boundary: report once and stop this section
        st.error(str(e))
        return
    if s1.get("error"):
        st.warning(s1["error"])
        return

    stats = s1["labels_stats"]
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Observations", stats["count"])
    c2.metric("Mean A", f"{stats['A_mean']:.2f}")
    c3.metric("Std A", f"{stats['A_std']:.2f}")
    c4.metric("Range", f"{stats['A_min']:.1f} \u2013 {stats['A_max']:.1f}")
    st.caption(f"Date range: {stats['date_min']} to {stats['date_max']}")

    if _HAS_PLOTLY and s1.get("labels") is not None:
        A = s1["labels"].iloc[:, 0]
        with st.expander("About: Distribution of A"):
            st.markdown(
                "Histogram of all computed A values. A right-skewed distribution is typical: "
                "many low-A values (early/late day, cloudy) with a tail of high-A values "
                "(midday, full sun). The peak should be between 5\u201315 \u00b5mol m\u207b\u00b2 s\u207b\u00b9 for grapevines."
            )
        fig = px.histogram(x=A[A >= 1].dropna(), nbins=50, title="Distribution of A (Stage 1 labels, A \u2265 1)")
        fig.update_layout(xaxis_title="A (\u00b5mol m\u207b\u00b2 s\u207b\u00b9)", xaxis_range=[1, None])
        st.plotly_chart(fig)

        with st.expander("About: A over time"):
            st.markdown(
                "Time series of A across the dataset. Only the **growing season** "
                "(May\u2013Sep) is included \u2014 the gaps between clusters represent the "
                "dormant months (Oct\u2013Apr) when the vine does not photosynthesize "
                "and no data is collected. Within each season, look for diurnal "
                "oscillations and any anomalous spikes that may indicate sensor issues."
            )
        # Resample to daily mean to compress gaps and smooth diurnal noise
        A_daily = A.resample("D").mean().dropna()
        fig2 = go.Figure()
        fig2.add_trace(go.Scatter(
            x=A_daily.index, y=A_daily.values, mode="lines",
            name="A (daily mean)", line=dict(width=1.5, color=_BRAND_GREEN),
            connectgaps=False,
        ))
        fig2.update_layout(
            title="A over time (daily mean)",
            xaxis_title="Time", yaxis_title="A (\u00b5mol m\u207b\u00b2 s\u207b\u00b9)",
        )
        st.plotly_chart(fig2)

    sample = s1.get("sensor_sample")
    if sample is not None and not sample.empty and _HAS_PLOTLY:
        _render_sensor_distributions(sample)
        if "Air1_leafTemperature_ref" in sample.columns and "Air1_airTemperature_ref" in sample.columns:
            _render_delta_t(sample)


def _render_sensor_distributions(df: pd.DataFrame) -> None:
    """Draw one histogram per available Farquhar input column (requires plotly)."""
    st.subheader("Sensor distributions (daytime PAR > 50)")
    with st.expander("About sensor distributions"):
        st.markdown(
            "Histograms of the main sensor inputs used in the Farquhar model, filtered "
            "to daytime only (PAR > 50 \u00b5mol m\u207b\u00b2 s\u207b\u00b9).\n\n"
            "- **PAR:** Light energy for photosynthesis (400\u2013700 nm). "
            "Values above 2500 are sensor artifacts and are excluded.\n"
            "- **Leaf Temp:** Leaf surface temperature (\u00b0C).\n"
            "- **Air Temp:** Ambient temperature near the canopy (\u00b0C)."
        )
    sensor_cols = [
        c for c in ["Air1_PAR_ref", "Air1_leafTemperature_ref", "Air1_airTemperature_ref"]
        if c in df.columns
    ]
    if not sensor_cols:
        return
    for col_st, col_name in zip(st.columns(len(sensor_cols)), sensor_cols):
        with col_st:
            series = df[col_name].dropna()
            # Remove PAR outliers (sensor artifacts above 2500)
            if col_name == "Air1_PAR_ref":
                series = series[series <= 2500]
            fig = px.histogram(series, nbins=40, title=col_name.replace("Air1_", "").replace("_ref", ""))
            fig.update_layout(height=300)
            st.plotly_chart(fig)


def _render_delta_t(df: pd.DataFrame) -> None:
    """Show the leaf-air temperature difference as a histogram and over time.

    Expects both temperature columns to be present in ``df`` and plotly to be
    available; the caller checks both.
    """
    st.subheader("Leaf\u2013Air temperature difference (\u0394T)")
    with st.expander("Why is \u0394T important?"):
        st.markdown(
            "The difference between leaf and air temperature "
            "(**\u0394T = T_leaf \u2212 T_air**) is a direct indicator of "
            "**plant water stress**.\n\n"
            "- **\u0394T < 0** (leaf cooler than air): the vine is transpiring "
            "normally \u2014 evaporative cooling keeps the leaf below air "
            "temperature. The stomata are open and photosynthesis is active.\n"
            "- **\u0394T \u2248 0**: transpiration is slowing down.\n"
            "- **\u0394T > 0** (leaf warmer than air): the vine has partially "
            "or fully closed its stomata due to water stress or extreme VPD. "
            "Transpiration has stopped cooling the leaf, so it heats up "
            "above ambient. Photosynthesis is severely limited.\n\n"
            "This is the basis of the **Crop Water Stress Index (CWSI)** "
            "used in the Farquhar model. In agrivoltaics, a rising \u0394T is "
            "the signal that the vine would benefit from tracker shading: "
            "the extra light cannot be used anyway because the stomata "
            "are shut."
        )
    delta_all = df["Air1_leafTemperature_ref"] - df["Air1_airTemperature_ref"]
    has_value = delta_all.notna()
    delta_t = delta_all[has_value]

    col_hist, col_time = st.columns(2)
    with col_hist:
        fig_dt = px.histogram(
            delta_t, nbins=50,
            title="\u0394T distribution (daytime)",
            color_discrete_sequence=[_BRAND_GREEN],
        )
        fig_dt.update_layout(
            xaxis_title="\u0394T = T_leaf \u2212 T_air (\u00b0C)",
            yaxis_title="Count",
            height=350,
        )
        fig_dt.add_vline(x=0, line_dash="dash", line_color="red",
                         annotation_text="T_leaf = T_air")
        st.plotly_chart(fig_dt)
    with col_time:
        if "time" in df.columns:
            # Filter to growing season only (May-Sep). The same row mask is
            # applied to timestamps and ΔT so x and y stay paired even when
            # some ΔT values are missing.
            _ts = pd.to_datetime(df["time"], utc=True)
            _keep = has_value & _ts.dt.month.isin([5, 6, 7, 8, 9])
            _dt_grow = delta_all[_keep]
            _ts_grow = _ts[_keep]
            fig_dt2 = go.Figure()
            fig_dt2.add_trace(go.Scatter(
                x=_ts_grow, y=_dt_grow.values,
                mode="markers", marker=dict(size=2, color=_BRAND_GREEN, opacity=0.4),
                name="\u0394T",
            ))
            fig_dt2.add_hline(y=0, line_dash="dash", line_color="red")
            fig_dt2.update_layout(
                title="\u0394T over time",
                xaxis_title="Time",
                yaxis_title="\u0394T (\u00b0C)",
                height=350,
            )
            st.plotly_chart(fig_dt2)


def _render_weather_station() -> None:
    """Show IMS weather-station statistics and the merged training dataset."""
    st.subheader("Weather station data")
    with st.expander("About this data"):
        st.markdown(
            "Shows the IMS weather station data and the merged dataset used for "
            "prediction model training. This helps verify data overlap, check for "
            "missing values, and understand the weather patterns."
        )
    try:
        from scripts.eda import get_stage2_eda
        s2 = get_stage2_eda()
    except Exception as e:  # UI boundary: report once and stop this section
        st.error(str(e))
        return
    if s2.get("error"):
        st.warning(s2["error"])
        return

    stats = s2["stats"]
    c1, c2, c3 = st.columns(3)
    c1.metric("IMS rows", f"{stats['ims_rows']:,}")
    c2.metric("Merged rows", f"{stats['merged_rows']:,}")
    c3.metric("Features", len(stats["feature_cols"]))
    st.caption(f"IMS range: {stats['ims_date_min']} to {stats['ims_date_max']}")
    with st.expander("What are the feature columns?"):
        st.markdown(
            "IMS weather variables and engineered time features used as "
            "ML inputs. No on-site sensor data is included (strict separation to avoid leakage).\n\n"
            "- **air_temperature_c, tdmax_c, tdmin_c:** Temperature from IMS station.\n"
            "- **ghi_w_m2:** Global Horizontal Irradiance (solar radiation) \u2014 proxy for PAR.\n"
            "- **rh_percent:** Relative humidity.\n"
            "- **rain_mm:** Precipitation.\n"
            "- **wind_speed_ms:** Wind speed.\n"
            "- **hour_sin, hour_cos:** Cyclical encoding of hour-of-day.\n"
            "- **doy_sin, doy_cos:** Cyclical encoding of day-of-year (seasonality)."
        )

    merged = s2["merged"]
    if merged is None:
        # Checked before first use; nothing further can be shown without it.
        return
    st.dataframe(merged.describe())
    if not _HAS_PLOTLY:
        return

    if "A" in merged.columns:
        with st.expander("About: Distribution of A (merged)"):
            st.markdown(
                "Distribution of A in the merged IMS+labels dataset. This is the subset "
                "of Stage 1 labels that have matching IMS timestamps. Compare with Stage 1 "
                "distribution to check for sampling bias."
            )
        fig = px.histogram(merged["A"][merged["A"] >= 1].dropna(), nbins=50, title="Distribution of A (merged set, A \u2265 1)")
        fig.update_layout(xaxis_title="A (\u00b5mol m\u207b\u00b2 s\u207b\u00b9)", xaxis_range=[1, None])
        st.plotly_chart(fig)

    num_cols = [c for c in stats["feature_cols"] if c in merged.columns][:4]
    if num_cols:
        with st.expander("About: Feature distributions"):
            st.markdown(
                "Histograms of the first four numeric IMS features in the merged dataset. "
                "Check for: reasonable value ranges, skewness, outliers, and missing-value "
                "patterns that might affect model training."
            )
        fig = make_subplots(rows=2, cols=2, subplot_titles=num_cols)
        for i, col in enumerate(num_cols):
            r, c = i // 2 + 1, i % 2 + 1
            fig.add_trace(go.Histogram(x=merged[col].dropna(), nbinsx=30), row=r, col=c)
        fig.update_layout(title="Feature distributions (merged)")
        st.plotly_chart(fig)


# ── AI Data Engineering ──────────────────────────────────────────────────────
# Display labels for the sensor columns shown in the threshold table and charts.
_SENSOR_LABELS = {
    "Air1_PAR_ref": "PAR (μmol/m²/s)",
    "Air1_leafTemperature_ref": "T_leaf (°C)",
    "Air1_airTemperature_ref": "T_air (°C)",
    "Air1_VPD_ref": "VPD (kPa)",
    "Air1_airHumidity_ref": "Humidity (%)",
    "Air1_CO2_ref": "CO₂ raw (ppm)",
}


def _count_bound_violations(frame: pd.DataFrame, thresholds: dict) -> dict[str, int]:
    """Count, per thresholded column in ``frame``, values outside its hard bounds.

    Columns named in ``thresholds`` but absent from ``frame`` are skipped; a
    bound that is ``None`` or missing is not checked.
    """
    counts: dict[str, int] = {}
    for col, spec in thresholds.items():
        if col not in frame.columns:
            continue
        lo, hi = spec.get("lower_bound"), spec.get("upper_bound")
        outside = pd.Series(False, index=frame.index)
        if lo is not None:
            outside |= frame[col] < lo
        if hi is not None:
            outside |= frame[col] > hi
        counts[col] = int(outside.sum())
    return counts


def _run_ai_pipeline() -> dict:
    """Load the sensor file, then run anomaly analysis, cleaning and feature engineering.

    Returns a dict with the raw/cleaned/engineered frames, the thresholds and
    feature spec, violation counts before/after cleaning, the hourly stress
    profile (``None`` when no ``stress_risk_score`` column was produced) and a
    flag telling whether the thresholds came from Gemini.

    Raises whatever the loader or ``LLMDataEngineer`` raises; the caller
    reports it to the user.
    """
    from src.llm_data_engineer import LLMDataEngineer, SENSOR_CONTEXT

    _loader_path = settings.SENSORS_WIDE_SAMPLE_PATH
    if not _loader_path.exists():
        _loader_path = settings.SENSORS_WIDE_PATH
    if not _loader_path.exists():
        raise FileNotFoundError("Sensor data file not found.")
    _df_raw = pd.read_csv(_loader_path)

    _engineer = LLMDataEngineer(verbose=False)
    _present = [c for c in SENSOR_CONTEXT if c in _df_raw.columns]
    with st.spinner("Querying Gemini for anomaly thresholds…"):
        _thresholds = _engineer.analyze_anomalies(_df_raw, columns=_present)
    if not _thresholds:
        raise ValueError("No anomaly thresholds were returned for the available sensor columns.")
    _df_clean = _engineer.apply_cleaning(_df_raw, _thresholds, strategy="clip")
    with st.spinner("Querying Gemini for feature engineering spec…"):
        _feat_spec = _engineer.get_feature_spec(list(_df_clean.columns))
    _df_eng = _engineer.engineer_features(_df_clean, feature_spec=_feat_spec)

    # Stress profile: mean score per local hour over daytime rows. Only built
    # when the score exists so a missing column does not fail the whole view.
    _stress_profile = None
    if "stress_risk_score" in _df_eng.columns:
        # Fixed UTC+3 offset, matching the chart's "Israel = UTC+3" caption.
        _df_eng["_hr_local"] = (pd.to_datetime(_df_eng["time"], utc=True).dt.hour + 3) % 24
        _daytime = _df_eng[_df_eng["Air1_PAR_ref"] > 50] if "Air1_PAR_ref" in _df_eng.columns else _df_eng
        _stress_profile = (
            _daytime.groupby("_hr_local")["stress_risk_score"].mean()
            .reindex(range(24), fill_value=float("nan"))
        )

    _first_rationale = next(iter(_thresholds.values())).get("rationale", "")
    return {
        "df_raw": _df_raw,
        "df_clean": _df_clean,
        "df_eng": _df_eng,
        "present": _present,
        "thresholds": _thresholds,
        "feat_spec": _feat_spec,
        "viol_before": _count_bound_violations(_df_raw, _thresholds),
        "viol_after": _count_bound_violations(_df_clean, _thresholds),
        "stress_profile": _stress_profile,
        "used_gemini": "Statistical fallback" not in _first_rationale,
    }


def _render_ai_data_engineering() -> None:
    """Run the LLM data-engineering pipeline and present its results."""
    st.subheader("AI Data Engineering")
    st.markdown(
        "Gemini analyzes each sensor column's statistics against known physical constraints "
        "for grapevines in the Negev desert, then returns per-column anomaly thresholds "
        "(hard bounds + Z-score + IQR multiplier). The pipeline also generates five "
        "engineered features fed directly into the ML prediction models."
    )
    try:
        result = _run_ai_pipeline()
    except Exception as _exc:  # UI boundary: any failure is shown, never swallowed
        st.error(f"Pipeline error: {_exc}")
        return

    _df_raw = result["df_raw"]
    _df_clean = result["df_clean"]
    _df_eng = result["df_eng"]
    _thresholds = result["thresholds"]
    _viol_before = result["viol_before"]
    _viol_after = result["viol_after"]

    _source_badge = (
        "🤖 Thresholds sourced from **Gemini**"
        if result["used_gemini"]
        else "⚙️ Thresholds from **statistical fallback** (set `GOOGLE_API_KEY` to enable Gemini)"
    )
    st.caption(_source_badge)

    _render_threshold_table(result["present"], _thresholds, _viol_before, _viol_after)
    _render_cleaning_summary(_df_raw, _df_clean, _viol_before, _viol_after)
    if _HAS_PLOTLY:
        _render_before_after(_df_raw, _df_clean, _thresholds)
    _render_engineered_features(_df_eng, result["feat_spec"])
    if _HAS_PLOTLY and result["stress_profile"] is not None:
        _render_stress_profile(result["stress_profile"])


def _render_threshold_table(present: list, thresholds: dict,
                            viol_before: dict, viol_after: dict) -> None:
    """Section 1: table of per-sensor anomaly thresholds and violation counts."""
    st.markdown("#### Gemini anomaly thresholds")
    with st.expander("How are thresholds generated?"):
        st.markdown(
            "For each sensor column, `LLMDataEngineer` sends the full descriptive "
            "statistics (min, max, percentiles) plus domain context — physical units, "
            "expected range for the Negev site, known failure modes — to Gemini. "
            "Gemini returns a JSON with:\n\n"
            "- **Hard bounds** (`lower_bound` / `upper_bound`): values outside these are "
            "physically impossible or known sensor faults.\n"
            "- **Z-score threshold**: flags readings that deviate more than N standard "
            "deviations from the column mean.\n"
            "- **IQR multiplier**: flags readings outside Q1 − k·IQR … Q3 + k·IQR.\n\n"
            "A reading is flagged only when the hard-bound violation OR both the Z-score "
            "AND IQR conditions are met simultaneously. Default strategy is **clip** "
            "(clamp to bounds), preserving row count."
        )
    _thresh_rows = []
    for _col in present:
        _t = thresholds.get(_col, {})
        _thresh_rows.append({
            "Sensor": _SENSOR_LABELS.get(_col, _col),
            "Lower bound": _t.get("lower_bound", "—"),
            "Upper bound": _t.get("upper_bound", "—"),
            "Z-score σ": _t.get("zscore_threshold", "—"),
            "IQR ×": _t.get("iqr_multiplier", "—"),
            "Violations (raw)": viol_before.get(_col, 0),
            "After clip": viol_after.get(_col, 0),
            "Rationale": _t.get("rationale", ""),
        })
    st.dataframe(pd.DataFrame(_thresh_rows), hide_index=True)


def _render_cleaning_summary(df_raw: pd.DataFrame, df_clean: pd.DataFrame,
                             viol_before: dict, viol_after: dict) -> None:
    """Section 2: headline cleaning metrics and a per-sensor violation bar chart."""
    st.markdown("#### Cleaning summary")
    _total_viol = sum(viol_before.values())
    # Number of columns left with zero violations after cleaning.
    _total_cleared = sum(1 for v in viol_after.values() if v == 0)
    # An empty dataset has no retention ratio; report 0 rather than divide by zero.
    _pct_retained = len(df_clean) / len(df_raw) * 100 if len(df_raw) else 0.0
    _cm1, _cm2, _cm3, _cm4 = st.columns(4)
    _cm1.metric("Rows in dataset", f"{len(df_raw):,}")
    _cm2.metric("Physical violations found", str(_total_viol))
    _cm3.metric("Columns fully cleared", f"{_total_cleared} / {len(viol_before)}")
    _cm4.metric("Rows retained (clip)", f"{_pct_retained:.1f}%")

    if _HAS_PLOTLY and _total_viol > 0:
        _violating = {c: n for c, n in viol_before.items() if n > 0}
        _fig_viol = px.bar(
            x=[_SENSOR_LABELS.get(c, c) for c in _violating],
            y=list(_violating.values()),
            labels={"x": "Sensor", "y": "Violation count"},
            title="Physical violations by sensor (before cleaning)",
            color_discrete_sequence=[_BRAND_GREEN],
        )
        _fig_viol.update_layout(height=300)
        st.plotly_chart(_fig_viol)


def _render_before_after(df_raw: pd.DataFrame, df_clean: pd.DataFrame,
                         thresholds: dict) -> None:
    """Section 3: overlaid raw vs cleaned histograms for PAR and VPD (requires plotly)."""
    st.markdown("#### Before vs after cleaning — PAR & VPD")
    with st.expander("What to look for"):
        st.markdown(
            "The **raw** histogram (red) includes all sensor readings. "
            "The **cleaned** histogram (green) shows the same column after "
            "the Gemini-generated thresholds are applied. Outlier spikes at the "
            "far right of PAR and VPD should disappear or be clipped to the bound."
        )
    _ba_cols = st.columns(2)
    for _idx, _col in enumerate(["Air1_PAR_ref", "Air1_VPD_ref"]):
        if _col not in df_raw.columns:
            continue
        _label = _SENSOR_LABELS.get(_col, _col)
        _hi_bound = thresholds.get(_col, {}).get("upper_bound")
        _fig_ba = go.Figure()
        _fig_ba.add_trace(go.Histogram(
            x=df_raw[_col].dropna(), nbinsx=60, name="Raw",
            marker_color="crimson", opacity=0.55,
        ))
        _fig_ba.add_trace(go.Histogram(
            x=df_clean[_col].dropna(), nbinsx=60, name="Cleaned",
            marker_color=_BRAND_GREEN, opacity=0.7,
        ))
        if _hi_bound is not None:
            _fig_ba.add_vline(
                x=_hi_bound, line_dash="dash", line_color="orange",
                annotation_text=f"bound={_hi_bound}",
            )
        _fig_ba.update_layout(
            barmode="overlay", title=f"{_label} — raw vs cleaned",
            xaxis_title=_label, height=320,
        )
        with _ba_cols[_idx]:
            st.plotly_chart(_fig_ba)


def _render_engineered_features(df_eng: pd.DataFrame, feat_spec: dict) -> None:
    """Section 4: feature-spec metrics and min/mean/max of the engineered columns."""
    st.markdown("#### Engineered features")
    with st.expander("How are features engineered?"):
        st.markdown(
            "After cleaning, the pipeline asks Gemini to confirm the optimal weights "
            "and normalisation bounds for the **Stress Risk Score**, given the available "
            "sensor columns and the Semillon grapevine stress physiology. "
            "It then computes five new columns:\n\n"
            "| Feature | Formula | Purpose |\n"
            "|---|---|---|\n"
            "| `hour_sin` | sin(2π·h/24) | Cyclical hour-of-day |\n"
            "| `hour_cos` | cos(2π·h/24) | Cyclical hour-of-day |\n"
            "| `doy_sin` | sin(2π·d/365) | Seasonal position |\n"
            "| `doy_cos` | cos(2π·d/365) | Seasonal position |\n"
            "| `stress_risk_score` | w_VPD·norm(VPD) + w_CWSI·norm(CWSI) | Acute stress in [0, 1] |\n\n"
            "Cyclical encodings ensure that midnight→01:00 and 23:00→midnight are "
            "treated as equally close by the model — something a raw hour integer cannot do."
        )
    # Feature spec card
    _fc1, _fc2, _fc3 = st.columns(3)
    _fc1.metric("VPD weight", feat_spec.get("vpd_weight", "—"))
    _fc2.metric("CWSI weight", feat_spec.get("cwsi_weight", "—"))
    _fc3.metric("VPD clip max (kPa)", feat_spec.get("vpd_clip_max", "—"))
    _fs_rat = feat_spec.get("rationale", "")
    st.caption(f"Gemini rationale: {_fs_rat}")

    # Feature stats table (skipped when none of the columns were produced,
    # because describe() cannot run on a frame with no columns)
    _eng_feat_cols = ["hour_sin", "hour_cos", "doy_sin", "doy_cos", "stress_risk_score"]
    _available = [c for c in _eng_feat_cols if c in df_eng.columns]
    if _available:
        _feat_stats = (
            df_eng[_available]
            .describe(percentiles=[0.25, 0.5, 0.75])
            .loc[["min", "mean", "max"]]
            .round(4)
        )
        st.dataframe(_feat_stats)


def _render_stress_profile(stress_profile: pd.Series) -> None:
    """Section 5: bar chart of the mean stress score per local hour (requires plotly).

    ``stress_profile`` is indexed by hour 0-23, with NaN for hours without data.
    """
    st.markdown("#### Daytime stress profile")
    with st.expander("How to read this chart"):
        st.markdown(
            "Mean **Stress Risk Score** per local hour (Israel = UTC+3), "
            "computed over all daytime readings (PAR > 50 μmol m⁻² s⁻¹). "
            "A score of 1.0 means the vine is under maximum atmospheric demand; "
            "0.0 means no stress. The midday–afternoon peak is the primary "
            "window where SolarWine shading interventions are concentrated."
        )
    _profile_df = stress_profile.dropna().reset_index()
    if _profile_df.empty:
        # No hour has a score, so there is no peak to locate or plot.
        st.info("No daytime stress data available for the hourly profile.")
        return
    _profile_df.columns = ["Hour (local)", "Stress Risk Score"]
    _peak_hr = int(_profile_df.loc[_profile_df["Stress Risk Score"].idxmax(), "Hour (local)"])
    _fig_stress = px.bar(
        _profile_df,
        x="Hour (local)", y="Stress Risk Score",
        color="Stress Risk Score",
        color_continuous_scale=["#00BD3E", "#f5c518", "#e63946"],
        range_y=[0, 1],
        title=f"Hourly stress profile — peak at {_peak_hr:02d}:00 local",
    )
    _fig_stress.add_hline(
        y=0.5, line_dash="dot", line_color="orange",
        annotation_text="Intervention threshold (0.5)",
    )
    _fig_stress.update_layout(
        xaxis=dict(tickmode="linear", dtick=1),
        coloraxis_showscale=False,
        height=380,
    )
    st.plotly_chart(_fig_stress)
    st.caption(
        f"Peak stress: {_peak_hr:02d}:00 local "
        f"(score = {stress_profile.max():.3f}). "
        f"Low-stress morning window (before 10:00): "
        f"mean score = {stress_profile.loc[6:9].mean():.3f} — shading withheld."
    )
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Tab — Shading Simulator (was Panel vs Open Sky)
# ---------------------------------------------------------------------------