""" Data tab. """ from __future__ import annotations import streamlit as st import pandas as pd import numpy as np from config import settings from ui.bootstrap import _BRAND_GREEN, _HAS_PLOTLY, load_labels, load_metrics if _HAS_PLOTLY: import plotly.graph_objects as go from plotly.subplots import make_subplots def render_tab_data() -> None: _data_section = st.radio( "Section", ["Farquhar Model", "Model Validation", "Data Explorer"], horizontal=True, ) if _data_section == "Farquhar Model": st.header("How we measure vine photosynthesis") st.markdown( "Before we can predict photosynthesis, we need to **measure** it. " "On-site sensors record light, temperature, humidity, and CO2 every " "15 minutes. A well-established plant biology model (Farquhar et al., 1980) " "converts these readings into the photosynthesis rate **A** — " "how fast the vine is converting sunlight into sugar. " "This tab shows the results of that calculation." ) with st.expander("How does the Farquhar model work?"): st.markdown( "Uses the **Farquhar et al. (1980)** mechanistic model with " "**Greer & Weedon (2012)** grapevine parameters to compute the net leaf " "photosynthesis rate **A** (\u00b5mol CO\u2082 m\u207b\u00b2 s\u207b\u00b9) from on-site sensor readings " "(PAR, leaf temperature, air temperature, CO\u2082, VPD). Only daytime rows " "(PAR > 50) during the growing season (May\u2013Sep) are used. " "Temperature dependencies use **Bernacchi et al. (2001)** kinetic constants.\n\n" "The core computation:\n\n" "1. **Rubisco-limited rate:** Ac = Vcmax \u00b7 (ci \u2212 \u0393*) / (ci + Kc \u00b7 (1 + O\u1d62 / Ko))\n" "2. **RuBP-limited rate:** Aj = J \u00b7 (ci \u2212 \u0393*) / (4\u00b7ci + 8\u00b7\u0393*)\n" "3. **Net assimilation:** A = min(Ac, Aj) \u2212 Rd\n\n" "Where Vcmax and Jmax follow temperature-dependent Arrhenius curves " "(peak at 39\u00b0C and 36\u00b0C respectively for Semillon), J is solved from the " "light-response quadratic, and ci (intercellular CO\u2082) is derived from " "ambient CO\u2082 scaled by stomatal conductance (reduced by VPD and CWSI).\n\n" "#### How VPD and CWSI reduce photosynthesis\n\n" "Both VPD and CWSI act on photosynthesis through the same bottleneck: " "**stomatal conductance (gs)**. Stomata are the pores on the leaf surface " "that let CO\u2082 in for photosynthesis \u2014 but also let water vapor out.\n\n" "**VPD (Vapor Pressure Deficit)** measures how dry the air is. " "When VPD is high (hot, dry air), the leaf would lose water too fast, " "so the vine partially closes its stomata to conserve water. " "Less open stomata = less CO\u2082 enters the leaf = lower ci = lower A. " "The model applies an exponential decay: " "`gs_scale = exp(\u22120.3 \u00b7 max(0, VPD \u2212 1.0))`, " "so the effect kicks in above 1 kPa and intensifies with drier air.\n\n" "**CWSI (Crop Water Stress Index)** is computed from the leaf\u2013air " "temperature difference: `CWSI = (Tleaf \u2212 Tair \u2212 \u0394Tmin) / (\u0394Tmax \u2212 \u0394Tmin)`, " "clipped to [0, 1]. A well-watered vine transpires freely, keeping its " "leaves cooler than the air (CWSI \u2248 0). A stressed vine closes stomata, " "transpiration drops, and leaves heat up (CWSI \u2192 1). " "The model reduces stomatal conductance by `(1 \u2212 0.5 \u00b7 CWSI)`, " "so at full stress (CWSI = 1) stomatal opening is halved.\n\n" "Combined effect on ci: " "`ci = CO\u2082 \u00b7 (1 \u2212 1 / (1.6 \u00b7 gs_factor))` where " "`gs_factor = 1.2 \u00b7 VPD_scale \u00b7 (1 \u2212 0.5 \u00b7 CWSI)`\n\n" "This means on a hot, dry Negev afternoon (VPD > 3 kPa, CWSI > 0.5), " "ci drops sharply and photosynthesis can fall even when light is abundant " "\u2014 the vine has plenty of energy but cannot get enough CO\u2082 through " "its closed stomata.\n\n" "**Key references:**\n" "- [Farquhar, von Caemmerer & Berry (1980)](https://doi.org/10.1007/BF00386231) \u2014 " "Biochemical model of photosynthetic CO\u2082 assimilation\n" "- [Greer & Weedon (2012)](https://doi.org/10.1111/j.1365-3040.2011.02471.x) \u2014 " "Modelling photosynthetic responses to temperature of grapevine\n" "- [Bernacchi et al. (2001)](https://doi.org/10.1046/j.1365-3040.2001.00668.x) \u2014 " "Temperature dependence of Kc, Ko, and \u0393*" ) st.subheader("Sensor inputs") st.markdown("The Farquhar model uses **6 columns** from the Air1 reference station, " "sampled every 15 minutes:") used_sensors_tab = pd.DataFrame([ {"Column": "Air1_PAR_ref", "Measurement": "Photosynthetically Active Radiation", "Units": "\u00b5mol m\u207b\u00b2 s\u207b\u00b9", "Used in": "Farquhar: electron transport (J)", "Notes": "Primary light input driving photosynthesis rate"}, {"Column": "Air1_leafTemperature_ref", "Measurement": "Leaf temperature", "Units": "\u00b0C", "Used in": "Farquhar: Vcmax, Jmax, Kc, Ko, \u0393*; CWSI", "Notes": "Controls enzyme kinetics; also used to compute CWSI"}, {"Column": "Air1_airTemperature_ref", "Measurement": "Air temperature", "Units": "\u00b0C", "Used in": "CWSI computation", "Notes": "Tleaf \u2212 Tair drives the water stress index"}, {"Column": "Air1_CO2_ref", "Measurement": "Ambient CO\u2082 concentration", "Units": "ppm", "Used in": "Farquhar: intercellular CO\u2082 (ci)", "Notes": "Substrate for carbon fixation by Rubisco (\u00d70.7 correction applied)"}, {"Column": "Air1_VPD_ref", "Measurement": "Vapor Pressure Deficit", "Units": "kPa", "Used in": "Farquhar: stomatal conductance \u2192 ci", "Notes": "High VPD closes stomata, reducing ci and thus A"}, {"Column": "Air1_airHumidity_ref", "Measurement": "Relative humidity", "Units": "%", "Used in": "Loaded but not consumed", "Notes": "VPD already encodes humidity; column is redundant"}, ]) st.dataframe(used_sensors_tab, hide_index=True) labels_path = settings.PROCESSED_DIR / "stage1_labels.csv" validation_img = settings.OUTPUTS_DIR / "stage1_validation.png" if labels_path.exists(): df_labels = load_labels(str(labels_path)) df_labels.index = pd.to_datetime(df_labels.index, utc=True) # Metrics row c1, c2, c3, c4 = st.columns(4) c1.metric("Observations", f"{len(df_labels):,}") c2.metric("Avg photosynthesis rate", f"{df_labels.iloc[:, 0].mean():.1f}", help="Higher values mean the vine is growing faster. Typical range: 5-20.") c3.metric("Date from", df_labels.index.min().strftime("%Y-%m-%d")) c4.metric("Date to", df_labels.index.max().strftime("%Y-%m-%d")) with st.expander("What does this table show?"): st.markdown( "Descriptive statistics (count, mean, std, min, quartiles, max) of the " "computed photosynthesis rate **A**. Typical grapevine values are 0\u201325 " "\u00b5mol CO\u2082 m\u207b\u00b2 s\u207b\u00b9. Values outside this range may indicate sensor issues." ) st.dataframe(df_labels.describe()) st.caption( "This table summarizes the photosynthesis measurements. The 'mean' row shows " "the average rate across all observations. Values between 5-20 are typical " "for healthy grapevines during the growing season." ) st.download_button( "Download labels CSV", df_labels.to_csv(), file_name="stage1_labels.csv", mime="text/csv", ) if validation_img.exists(): with st.expander("How to read the validation plots"): st.markdown( "**Top panel \u2014 Diurnal pattern:** A vs. hour of day (UTC). Expect a bell curve " "peaking mid-morning to early afternoon when light and temperature are optimal.\n\n" "**Bottom panel \u2014 A vs PAR:** Photosynthesis rate plotted against Photosynthetically " "Active Radiation. A should increase with PAR and saturate at high light levels, " "forming a characteristic light-response curve." ) st.image(str(validation_img), width='stretch') else: st.info("No pre-computed photosynthesis labels found. Run `python scripts/run_pipeline.py` to generate them.") # --------------------------------------------------------------------------- if _data_section == "Model Validation": st.header("FvCB Model Validation") st.markdown( "Comparison of our Farquhar-von Caemmerer-Berry (FvCB) photosynthesis model " "against measured data from " "[Greer & Weedon (2012)](https://doi.org/10.1111/j.1365-3040.2011.02471.x) " "for field-grown *Vitis vinifera* cv. **Semillon** in a hot climate " "(Riverina, NSW, Australia)." ) # --- Reference data from Greer & Weedon (2012) --- _val_temps = [20, 25, 30, 35, 40] # Fig 5a: Light-saturated Amax at ambient CO2 (389 ppm) _paper_amax = {20: 12.0, 25: 16.9, 30: 19.9, 35: 15.3, 40: 12.0} _paper_amax_se = {20: 1.5, 25: 1.3, 30: 1.8, 35: 1.2, 40: 1.5} # Fig 11: Vcmax and Jmax (from Arrhenius fit to A/Ci curves) _paper_vcmax = {20: 20, 25: 38.5, 30: 58, 35: 85, 40: 110} _paper_jmax = {20: 60, 25: 98.3, 30: 135, 35: 165, 40: 170} # Table 1: Stomatal limitation (%) _paper_stom_lim = {20: 13, 25: 20, 30: 25, 35: 31, 40: 34} from src.farquhar_model import FarquharModel as _FMVal _val_model = _FMVal() # ---- Section 1: Light Response Curves ---- st.subheader("1. Photosynthetic light response at different temperatures") st.markdown( "Light response curves (A vs PFD) at five leaf temperatures. " "Model run at ambient CO$_2$ = 389 ppm, VPD = 1.5 kPa." ) _val_vpd = st.slider( "VPD for model curves (kPa)", 0.5, 3.0, 1.5, 0.1, key="val_vpd" ) _val_pfds = np.arange(0, 2100, 25) if _HAS_PLOTLY: _lr_fig = go.Figure() _temp_colors = {20: "#1f77b4", 25: "#2ca02c", 30: "#d62728", 35: "#9467bd", 40: "#ff7f0e"} for t in _val_temps: # Model curve _a_vals = [_val_model.calc_photosynthesis( PAR=float(p), Tleaf=t, CO2=389, VPD=_val_vpd, Tair=t ) for p in _val_pfds] _lr_fig.add_trace(go.Scatter( x=_val_pfds, y=_a_vals, mode="lines", name=f"{t} °C (model)", line=dict(color=_temp_colors[t]), )) # Paper reference point (Amax) _lr_fig.add_trace(go.Scatter( x=[1800], y=[_paper_amax[t]], mode="markers", name=f"{t} °C (Greer & Weedon)", marker=dict(color=_temp_colors[t], size=12, symbol="star", line=dict(width=1, color="black")), error_y=dict(type="data", array=[_paper_amax_se[t]], visible=True), showlegend=True, )) _lr_fig.update_layout( xaxis_title="PFD [µmol photons m⁻² s⁻¹]", yaxis_title="A [µmol CO₂ m⁻² s⁻¹]", height=500, legend=dict(font=dict(size=10)), ) st.plotly_chart(_lr_fig) else: st.info("Install plotly for interactive charts.") # ---- Section 2: A comparison table ---- st.subheader("2. Light-saturated A: Model vs Paper") _rows = [] for t in _val_temps: a_model = _val_model.calc_photosynthesis( PAR=2000, Tleaf=t, CO2=389, VPD=_val_vpd, Tair=t ) _rows.append({ "T_leaf (°C)": t, "A_model": round(a_model, 1), f"A_paper (Greer & Weedon)": _paper_amax[t], "Difference (%)": round((a_model - _paper_amax[t]) / _paper_amax[t] * 100, 0), "Limitation": "RuBP regen." if t <= 30 else "Rubisco", }) st.dataframe(pd.DataFrame(_rows), hide_index=True) # ---- Section 3: Vcmax / Jmax temperature response ---- st.subheader("3. Vcmax and Jmax temperature response") st.markdown( "Temperature dependence of maximum carboxylation rate (Vcmax) and " "electron transport capacity (Jmax). Model uses modified Arrhenius " "(Medlyn et al. 2002) with Greer & Weedon (2012) activation/deactivation " "energies. Topt(Vcmax) = 39 °C, Topt(Jmax) = 36 °C." ) _t_range = np.arange(15, 50, 0.5) _vcmax_curve = [_val_model.calc_Vcmax(t + 273.15) for t in _t_range] _jmax_curve = [_val_model.calc_Jmax(t + 273.15) for t in _t_range] # Scale paper's Cc-based Vcmax/Jmax to our Ci-based values for comparison _scale_v = _val_model.params["k25_vcmax"] / 38.5 # ratio Ci-based/Cc-based _scale_j = _val_model.params["k25_jmax"] / 98.3 _paper_vcmax_scaled = {t: v * _scale_v for t, v in _paper_vcmax.items()} _paper_jmax_scaled = {t: j * _scale_j for t, j in _paper_jmax.items()} if _HAS_PLOTLY: _vj_fig = make_subplots(rows=1, cols=2, subplot_titles=("Vcmax", "Jmax")) _vj_fig.add_trace(go.Scatter( x=list(_t_range), y=_vcmax_curve, mode="lines", name="Vcmax (model)", line=dict(color="#d62728"), ), row=1, col=1) _vj_fig.add_trace(go.Scatter( x=list(_paper_vcmax_scaled.keys()), y=list(_paper_vcmax_scaled.values()), mode="markers", name="Vcmax (paper, scaled)", marker=dict(color="#d62728", size=10, symbol="star", line=dict(width=1, color="black")), ), row=1, col=1) _vj_fig.add_trace(go.Scatter( x=list(_t_range), y=_jmax_curve, mode="lines", name="Jmax (model)", line=dict(color="#1f77b4"), ), row=1, col=2) _vj_fig.add_trace(go.Scatter( x=list(_paper_jmax_scaled.keys()), y=list(_paper_jmax_scaled.values()), mode="markers", name="Jmax (paper, scaled)", marker=dict(color="#1f77b4", size=10, symbol="star", line=dict(width=1, color="black")), ), row=1, col=2) _vj_fig.update_xaxes(title_text="Leaf temperature (°C)") _vj_fig.update_yaxes(title_text="µmol m⁻² s⁻¹") _vj_fig.update_layout(height=400) st.plotly_chart(_vj_fig) else: st.info("Install plotly for interactive charts.") # ---- Section 4: Limitation regime ---- st.subheader("4. RuBP regeneration vs Rubisco carboxylation limitation") st.markdown( "The paper's key finding: **below 30 °C**, photosynthesis is limited by " "RuBP regeneration (electron transport / light reactions). " "**Above 30 °C**, Rubisco carboxylation becomes limiting due to " "declining CO$_2$ affinity and increased photorespiration.\n\n" "This 30 °C transition is critical for shading decisions:\n" "- **Below 30 °C**: shading reduces light and hurts photosynthesis (RuBP-limited)\n" "- **Above 30 °C**: shading may help by reducing heat stress on Rubisco" ) if _HAS_PLOTLY: _ac_vals = [] _aj_vals = [] _t_lim = np.arange(15, 46, 0.5) for t in _t_lim: Tk = t + 273.15 Vcmax = _val_model.calc_Vcmax(Tk) Jmax = _val_model.calc_Jmax(Tk) J = _val_model.calc_electron_transport(2000, Jmax) gamma = _val_model.calc_gamma_star(Tk) Kc = _val_model.calc_Kc(Tk) Ko = _val_model.calc_Ko(Tk) ci = _val_model._ci_from_ca(389, 1.5, 0.0) Ac = Vcmax * (ci - gamma) / (ci + Kc * (1 + 210.0 / Ko)) Aj = J * (ci - gamma) / (4 * ci + 8 * gamma) Rd = 0.015 * Vcmax _ac_vals.append(Ac - Rd) _aj_vals.append(Aj - Rd) _lim_fig = go.Figure() _lim_fig.add_trace(go.Scatter( x=list(_t_lim), y=_ac_vals, mode="lines", name="Ac (Rubisco-limited)", line=dict(color="#d62728", dash="dash"), )) _lim_fig.add_trace(go.Scatter( x=list(_t_lim), y=_aj_vals, mode="lines", name="Aj (RuBP-limited)", line=dict(color="#1f77b4", dash="dash"), )) # Actual A = min(Ac, Aj) _a_net = [max(0, min(ac, aj)) for ac, aj in zip(_ac_vals, _aj_vals)] _lim_fig.add_trace(go.Scatter( x=list(_t_lim), y=_a_net, mode="lines", name="A_net = min(Ac, Aj)", line=dict(color="black", width=3), )) _lim_fig.add_vline(x=30, line_dash="dot", line_color="gray", annotation_text="30 °C transition") _lim_fig.update_layout( xaxis_title="Leaf temperature (°C)", yaxis_title="A [µmol CO₂ m⁻² s⁻¹]", height=450, ) st.plotly_chart(_lim_fig) # ---- Section 5: Key findings ---- st.subheader("5. Key findings from validation") st.markdown(""" **Agreement with Greer & Weedon (2012):** - Temperature ranking of Amax matches: 30 °C > 25 °C > 35 °C > 20 °C > 40 °C - RuBP/Rubisco limitation transition occurs at ~30-32 °C (paper: 30 °C) - Vcmax peaks at 39 °C, Jmax peaks at 36 °C (exact match with paper) - Quantitative match within 2-15% at 20-35 °C - Jmax/Vcmax ratio declines from ~2.4 at 20 °C to ~1.1 at 45 °C (paper: 3.0 to 1.5) **Known limitations:** - At 40 °C, model underestimates A by ~12% due to Bernacchi (2001) Rubisco kinetics being parameterised for tobacco, not heat-adapted grapevine - Stomatal response to temperature is modelled via VPD only; the paper shows direct temperature effects on gs (Table 1: 0.199 at 20 °C to 0.140 at 40 °C) - The model uses Ci-based (intercellular CO2) calculations; the paper uses Cc-based (chloroplast CO2) with mesophyll conductance gm = 5-10 µmol m⁻² s⁻¹ Pa⁻¹ **Reference:** Greer, D.H. & Weedon, M.M. (2012) Modelling photosynthetic responses to temperature of grapevine (*Vitis vinifera* cv. Semillon) leaves on vines grown in a hot climate. *Plant, Cell & Environment*, 35, 1050-1064. [DOI: 10.1111/j.1365-3040.2011.02471.x](https://doi.org/10.1111/j.1365-3040.2011.02471.x) """) # --------------------------------------------------------------------------- if _data_section == "Data Explorer": st.header("Data Explorer") st.markdown( "This tab lets you explore the **raw data** behind the predictions. " "Choose a data source below:\n\n" "- **Vineyard sensors** \u2014 Photosynthesis rate **A**, PAR, leaf/air temperature, and \u0394T from on-site crop sensors.\n" "- **Weather station data** \u2014 IMS station 43 (Sde Boker) and the merged dataset used for ML training.\n" "- **AI Data Engineering** \u2014 **Gemini-powered** sensor anomaly detection (Z-score/IQR + physical bounds) and engineered features " "(cyclical time encodings, Stress Risk Score). Run the pipeline and inspect thresholds, cleaning summary, and the daytime stress profile." ) eda_stage = st.radio( "Data source", ["Vineyard sensors", "Weather station data", "AI Data Engineering"], horizontal=True, label_visibility="visible", ) if eda_stage == "Vineyard sensors": st.subheader("Vineyard sensor data") with st.expander("About this data"): st.markdown( "Shows the distribution and temporal patterns of the computed photosynthesis " "rate **A**, plus the raw sensor inputs used to calculate it. " "This helps verify that the model produces physiologically plausible values." ) try: from scripts.eda import get_stage1_eda s1 = get_stage1_eda() except Exception as e: st.error(str(e)) s1 = {"error": str(e)} if s1.get("error"): st.warning(s1["error"]) else: stats = s1["labels_stats"] c1, c2, c3, c4 = st.columns(4) c1.metric("Observations", stats["count"]) c2.metric("Mean A", f"{stats['A_mean']:.2f}") c3.metric("Std A", f"{stats['A_std']:.2f}") c4.metric("Range", f"{stats['A_min']:.1f} \u2013 {stats['A_max']:.1f}") st.caption(f"Date range: {stats['date_min']} to {stats['date_max']}") if _HAS_PLOTLY and s1.get("labels") is not None: A = s1["labels"].iloc[:, 0] with st.expander("About: Distribution of A"): st.markdown( "Histogram of all computed A values. A right-skewed distribution is typical: " "many low-A values (early/late day, cloudy) with a tail of high-A values " "(midday, full sun). The peak should be between 5\u201315 \u00b5mol m\u207b\u00b2 s\u207b\u00b9 for grapevines." ) fig = px.histogram(x=A[A >= 1].dropna(), nbins=50, title="Distribution of A (Stage 1 labels, A \u2265 1)") fig.update_layout(xaxis_title="A (\u00b5mol m\u207b\u00b2 s\u207b\u00b9)", xaxis_range=[1, None]) st.plotly_chart(fig) with st.expander("About: A over time"): st.markdown( "Time series of A across the dataset. Only the **growing season** " "(May\u2013Sep) is included \u2014 the gaps between clusters represent the " "dormant months (Oct\u2013Apr) when the vine does not photosynthesize " "and no data is collected. Within each season, look for diurnal " "oscillations and any anomalous spikes that may indicate sensor issues." ) # Resample to daily mean to compress gaps and smooth diurnal noise A_daily = A.resample("D").mean().dropna() fig2 = go.Figure() fig2.add_trace(go.Scatter( x=A_daily.index, y=A_daily.values, mode="lines", name="A (daily mean)", line=dict(width=1.5, color=_BRAND_GREEN), connectgaps=False, )) fig2.update_layout( title="A over time (daily mean)", xaxis_title="Time", yaxis_title="A (\u00b5mol m\u207b\u00b2 s\u207b\u00b9)", ) st.plotly_chart(fig2) if s1.get("sensor_sample") is not None and not s1["sensor_sample"].empty and _HAS_PLOTLY: df = s1["sensor_sample"] st.subheader("Sensor distributions (daytime PAR > 50)") with st.expander("About sensor distributions"): st.markdown( "Histograms of the main sensor inputs used in the Farquhar model, filtered " "to daytime only (PAR > 50 \u00b5mol m\u207b\u00b2 s\u207b\u00b9).\n\n" "- **PAR:** Light energy for photosynthesis (400\u2013700 nm). " "Values above 2500 are sensor artifacts and are excluded.\n" "- **Leaf Temp:** Leaf surface temperature (\u00b0C).\n" "- **Air Temp:** Ambient temperature near the canopy (\u00b0C)." ) sensor_cols = [c for c in ["Air1_PAR_ref", "Air1_leafTemperature_ref", "Air1_airTemperature_ref"] if c in df.columns] if sensor_cols: cols = st.columns(len(sensor_cols)) for col_st, col_name in zip(cols, sensor_cols): with col_st: series = df[col_name].dropna() # Remove PAR outliers (sensor artifacts above 2500) if col_name == "Air1_PAR_ref": series = series[series <= 2500] fig = px.histogram(series, nbins=40, title=col_name.replace("Air1_", "").replace("_ref", "")) fig.update_layout(height=300) st.plotly_chart(fig) # --- Air-Leaf Temperature Delta --- if "Air1_leafTemperature_ref" in df.columns and "Air1_airTemperature_ref" in df.columns: st.subheader("Leaf\u2013Air temperature difference (\u0394T)") with st.expander("Why is \u0394T important?"): st.markdown( "The difference between leaf and air temperature " "(**\u0394T = T_leaf \u2212 T_air**) is a direct indicator of " "**plant water stress**.\n\n" "- **\u0394T < 0** (leaf cooler than air): the vine is transpiring " "normally \u2014 evaporative cooling keeps the leaf below air " "temperature. The stomata are open and photosynthesis is active.\n" "- **\u0394T \u2248 0**: transpiration is slowing down.\n" "- **\u0394T > 0** (leaf warmer than air): the vine has partially " "or fully closed its stomata due to water stress or extreme VPD. " "Transpiration has stopped cooling the leaf, so it heats up " "above ambient. Photosynthesis is severely limited.\n\n" "This is the basis of the **Crop Water Stress Index (CWSI)** " "used in the Farquhar model. In agrivoltaics, a rising \u0394T is " "the signal that the vine would benefit from tracker shading: " "the extra light cannot be used anyway because the stomata " "are shut." ) delta_t = df["Air1_leafTemperature_ref"] - df["Air1_airTemperature_ref"] delta_t = delta_t.dropna() col_hist, col_time = st.columns(2) with col_hist: fig_dt = px.histogram( delta_t, nbins=50, title="\u0394T distribution (daytime)", color_discrete_sequence=[_BRAND_GREEN], ) fig_dt.update_layout( xaxis_title="\u0394T = T_leaf \u2212 T_air (\u00b0C)", yaxis_title="Count", height=350, ) fig_dt.add_vline(x=0, line_dash="dash", line_color="red", annotation_text="T_leaf = T_air") st.plotly_chart(fig_dt) with col_time: if "time" in df.columns: # Filter to growing season only (May-Sep) _ts = pd.to_datetime(df["time"], utc=True) _grow_mask = _ts.dt.month.isin([5, 6, 7, 8, 9]) _dt_grow = delta_t[_grow_mask] _ts_grow = _ts[_grow_mask] fig_dt2 = go.Figure() fig_dt2.add_trace(go.Scatter( x=_ts_grow, y=_dt_grow.values, mode="markers", marker=dict(size=2, color=_BRAND_GREEN, opacity=0.4), name="\u0394T", )) fig_dt2.add_hline(y=0, line_dash="dash", line_color="red") fig_dt2.update_layout( title="\u0394T over time", xaxis_title="Time", yaxis_title="\u0394T (\u00b0C)", height=350, ) st.plotly_chart(fig_dt2) elif eda_stage == "Weather station data": st.subheader("Weather station data") with st.expander("About this data"): st.markdown( "Shows the IMS weather station data and the merged dataset used for " "prediction model training. This helps verify data overlap, check for " "missing values, and understand the weather patterns." ) try: from scripts.eda import get_stage2_eda s2 = get_stage2_eda() except Exception as e: st.error(str(e)) s2 = {"error": str(e)} if s2.get("error"): st.warning(s2["error"]) else: stats = s2["stats"] c1, c2, c3 = st.columns(3) c1.metric("IMS rows", f"{stats['ims_rows']:,}") c2.metric("Merged rows", f"{stats['merged_rows']:,}") c3.metric("Features", len(stats["feature_cols"])) st.caption(f"IMS range: {stats['ims_date_min']} to {stats['ims_date_max']}") with st.expander("What are the feature columns?"): st.markdown( "IMS weather variables and engineered time features used as " "ML inputs. No on-site sensor data is included (strict separation to avoid leakage).\n\n" "- **air_temperature_c, tdmax_c, tdmin_c:** Temperature from IMS station.\n" "- **ghi_w_m2:** Global Horizontal Irradiance (solar radiation) \u2014 proxy for PAR.\n" "- **rh_percent:** Relative humidity.\n" "- **rain_mm:** Precipitation.\n" "- **wind_speed_ms:** Wind speed.\n" "- **hour_sin, hour_cos:** Cyclical encoding of hour-of-day.\n" "- **doy_sin, doy_cos:** Cyclical encoding of day-of-year (seasonality)." ) merged = s2["merged"] st.dataframe(merged.describe()) if _HAS_PLOTLY and "A" in merged.columns: with st.expander("About: Distribution of A (merged)"): st.markdown( "Distribution of A in the merged IMS+labels dataset. This is the subset " "of Stage 1 labels that have matching IMS timestamps. Compare with Stage 1 " "distribution to check for sampling bias." ) fig = px.histogram(merged["A"][merged["A"] >= 1].dropna(), nbins=50, title="Distribution of A (merged set, A \u2265 1)") fig.update_layout(xaxis_title="A (\u00b5mol m\u207b\u00b2 s\u207b\u00b9)", xaxis_range=[1, None]) st.plotly_chart(fig) if _HAS_PLOTLY and merged is not None: num_cols = [c for c in stats["feature_cols"] if c in merged.columns][:4] if num_cols: with st.expander("About: Feature distributions"): st.markdown( "Histograms of the first four numeric IMS features in the merged dataset. " "Check for: reasonable value ranges, skewness, outliers, and missing-value " "patterns that might affect model training." ) fig = make_subplots(rows=2, cols=2, subplot_titles=num_cols) for i, col in enumerate(num_cols): r, c = i // 2 + 1, i % 2 + 1 fig.add_trace(go.Histogram(x=merged[col].dropna(), nbinsx=30), row=r, col=c) fig.update_layout(title="Feature distributions (merged)") st.plotly_chart(fig) else: # ── AI Data Engineering ────────────────────────────────────────────── st.subheader("AI Data Engineering") st.markdown( "Gemini analyzes each sensor column's statistics against known physical constraints " "for grapevines in the Negev desert, then returns per-column anomaly thresholds " "(hard bounds + Z-score + IQR multiplier). The pipeline also generates five " "engineered features fed directly into the ML prediction models." ) _llm_err = None try: from src.llm_data_engineer import LLMDataEngineer, SENSOR_CONTEXT _loader_path = settings.SENSORS_WIDE_SAMPLE_PATH if not _loader_path.exists(): _loader_path = settings.SENSORS_WIDE_PATH if not _loader_path.exists(): raise FileNotFoundError("Sensor data file not found.") _df_raw = pd.read_csv(_loader_path) _engineer = LLMDataEngineer(verbose=False) _key_cols = list(SENSOR_CONTEXT.keys()) _present = [c for c in _key_cols if c in _df_raw.columns] with st.spinner("Querying Gemini for anomaly thresholds…"): _thresholds = _engineer.analyze_anomalies(_df_raw, columns=_present) _df_clean = _engineer.apply_cleaning(_df_raw, _thresholds, strategy="clip") with st.spinner("Querying Gemini for feature engineering spec…"): _feat_spec = _engineer.get_feature_spec(list(_df_clean.columns)) _df_eng = _engineer.engineer_features(_df_clean, feature_spec=_feat_spec) # Violation counts _viol_before, _viol_after = {}, {} for _col, _t in _thresholds.items(): if _col not in _df_raw.columns: continue _lo, _hi = _t.get("lower_bound"), _t.get("upper_bound") _m = pd.Series(False, index=_df_raw.index) if _lo is not None: _m |= _df_raw[_col] < _lo if _hi is not None: _m |= _df_raw[_col] > _hi _viol_before[_col] = int(_m.sum()) _m2 = pd.Series(False, index=_df_clean.index) if _lo is not None: _m2 |= _df_clean[_col] < _lo if _hi is not None: _m2 |= _df_clean[_col] > _hi _viol_after[_col] = int(_m2.sum()) # Stress profile _df_eng["_hr_local"] = (pd.to_datetime(_df_eng["time"], utc=True).dt.hour + 3) % 24 _daytime = _df_eng[_df_eng["Air1_PAR_ref"] > 50] if "Air1_PAR_ref" in _df_eng.columns else _df_eng _stress_profile = ( _daytime.groupby("_hr_local")["stress_risk_score"].mean() .reindex(range(24), fill_value=float("nan")) ) _used_gemini = "Statistical fallback" not in list(_thresholds.values())[0].get("rationale", "") except Exception as _exc: _llm_err = str(_exc) if _llm_err: st.error(f"Pipeline error: {_llm_err}") else: _source_badge = ( "🤖 Thresholds sourced from **Gemini**" if _used_gemini else "⚙️ Thresholds from **statistical fallback** (set `GOOGLE_API_KEY` to enable Gemini)" ) st.caption(_source_badge) # ── Section 1: Anomaly thresholds ────────────────────────────── st.markdown("#### Gemini anomaly thresholds") with st.expander("How are thresholds generated?"): st.markdown( "For each sensor column, `LLMDataEngineer` sends the full descriptive " "statistics (min, max, percentiles) plus domain context — physical units, " "expected range for the Negev site, known failure modes — to Gemini. " "Gemini returns a JSON with:\n\n" "- **Hard bounds** (`lower_bound` / `upper_bound`): values outside these are " "physically impossible or known sensor faults.\n" "- **Z-score threshold**: flags readings that deviate more than N standard " "deviations from the column mean.\n" "- **IQR multiplier**: flags readings outside Q1 − k·IQR … Q3 + k·IQR.\n\n" "A reading is flagged only when the hard-bound violation OR both the Z-score " "AND IQR conditions are met simultaneously. Default strategy is **clip** " "(clamp to bounds), preserving row count." ) _thresh_rows = [] _col_labels = { "Air1_PAR_ref": "PAR (μmol/m²/s)", "Air1_leafTemperature_ref": "T_leaf (°C)", "Air1_airTemperature_ref": "T_air (°C)", "Air1_VPD_ref": "VPD (kPa)", "Air1_airHumidity_ref": "Humidity (%)", "Air1_CO2_ref": "CO₂ raw (ppm)", } for _col in _present: _t = _thresholds.get(_col, {}) _vb = _viol_before.get(_col, 0) _va = _viol_after.get(_col, 0) _thresh_rows.append({ "Sensor": _col_labels.get(_col, _col), "Lower bound": _t.get("lower_bound", "—"), "Upper bound": _t.get("upper_bound", "—"), "Z-score σ": _t.get("zscore_threshold", "—"), "IQR ×": _t.get("iqr_multiplier", "—"), "Violations (raw)": _vb, "After clip": _va, "Rationale": _t.get("rationale", ""), }) _thresh_df = pd.DataFrame(_thresh_rows) st.dataframe(_thresh_df, hide_index=True) # ── Section 2: Cleaning summary metrics ──────────────────────── st.markdown("#### Cleaning summary") _total_viol = sum(_viol_before.values()) _total_cleared = sum(v for v in _viol_after.values() if v == 0) _pct_retained = len(_df_clean) / len(_df_raw) * 100 _cm1, _cm2, _cm3, _cm4 = st.columns(4) _cm1.metric("Rows in dataset", f"{len(_df_raw):,}") _cm2.metric("Physical violations found", str(_total_viol)) _cm3.metric("Columns fully cleared", f"{_total_cleared} / {len(_viol_before)}") _cm4.metric("Rows retained (clip)", f"{_pct_retained:.1f}%") if _HAS_PLOTLY and _total_viol > 0: _viol_cols = [_col_labels.get(c, c) for c in _viol_before if _viol_before[c] > 0] _viol_vals = [_viol_before[c] for c in _viol_before if _viol_before[c] > 0] _fig_viol = px.bar( x=_viol_cols, y=_viol_vals, labels={"x": "Sensor", "y": "Violation count"}, title="Physical violations by sensor (before cleaning)", color_discrete_sequence=[_BRAND_GREEN], ) _fig_viol.update_layout(height=300) st.plotly_chart(_fig_viol) # ── Section 3: Before / after distributions ──────────────────── if _HAS_PLOTLY: st.markdown("#### Before vs after cleaning — PAR & VPD") with st.expander("What to look for"): st.markdown( "The **raw** histogram (red) includes all sensor readings. " "The **cleaned** histogram (green) shows the same column after " "the Gemini-generated thresholds are applied. Outlier spikes at the " "far right of PAR and VPD should disappear or be clipped to the bound." ) _ba_cols = st.columns(2) for _idx, _col in enumerate(["Air1_PAR_ref", "Air1_VPD_ref"]): if _col not in _df_raw.columns: continue _label = _col_labels.get(_col, _col) _hi_bound = _thresholds.get(_col, {}).get("upper_bound") _raw_s = _df_raw[_col].dropna() _clean_s = _df_clean[_col].dropna() _fig_ba = go.Figure() _fig_ba.add_trace(go.Histogram( x=_raw_s, nbinsx=60, name="Raw", marker_color="crimson", opacity=0.55, )) _fig_ba.add_trace(go.Histogram( x=_clean_s, nbinsx=60, name="Cleaned", marker_color=_BRAND_GREEN, opacity=0.7, )) if _hi_bound is not None: _fig_ba.add_vline( x=_hi_bound, line_dash="dash", line_color="orange", annotation_text=f"bound={_hi_bound}", ) _fig_ba.update_layout( barmode="overlay", title=f"{_label} — raw vs cleaned", xaxis_title=_label, height=320, ) with _ba_cols[_idx]: st.plotly_chart(_fig_ba) # ── Section 4: Engineered features ───────────────────────────── st.markdown("#### Engineered features") with st.expander("How are features engineered?"): st.markdown( "After cleaning, the pipeline asks Gemini to confirm the optimal weights " "and normalisation bounds for the **Stress Risk Score**, given the available " "sensor columns and the Semillon grapevine stress physiology. " "It then computes five new columns:\n\n" "| Feature | Formula | Purpose |\n" "|---|---|---|\n" "| `hour_sin` | sin(2π·h/24) | Cyclical hour-of-day |\n" "| `hour_cos` | cos(2π·h/24) | Cyclical hour-of-day |\n" "| `doy_sin` | sin(2π·d/365) | Seasonal position |\n" "| `doy_cos` | cos(2π·d/365) | Seasonal position |\n" "| `stress_risk_score` | w_VPD·norm(VPD) + w_CWSI·norm(CWSI) | Acute stress in [0, 1] |\n\n" "Cyclical encodings ensure that midnight→01:00 and 23:00→midnight are " "treated as equally close by the model — something a raw hour integer cannot do." ) # Feature spec card _fs_vpd_w = _feat_spec.get("vpd_weight", "—") _fs_cwsi_w = _feat_spec.get("cwsi_weight", "—") _fs_vpd_clip = _feat_spec.get("vpd_clip_max", "—") _fs_rat = _feat_spec.get("rationale", "") _fc1, _fc2, _fc3 = st.columns(3) _fc1.metric("VPD weight", _fs_vpd_w) _fc2.metric("CWSI weight", _fs_cwsi_w) _fc3.metric("VPD clip max (kPa)", _fs_vpd_clip) st.caption(f"Gemini rationale: {_fs_rat}") # Feature stats table _eng_feat_cols = ["hour_sin", "hour_cos", "doy_sin", "doy_cos", "stress_risk_score"] _feat_stats = ( _df_eng[[c for c in _eng_feat_cols if c in _df_eng.columns]] .describe(percentiles=[0.25, 0.5, 0.75]) .loc[["min", "mean", "max"]] .round(4) ) st.dataframe(_feat_stats) # ── Section 5: Daytime stress profile ────────────────────────── if _HAS_PLOTLY and "stress_risk_score" in _df_eng.columns: st.markdown("#### Daytime stress profile") with st.expander("How to read this chart"): st.markdown( "Mean **Stress Risk Score** per local hour (Israel = UTC+3), " "computed over all daytime readings (PAR > 50 μmol m⁻² s⁻¹). " "A score of 1.0 means the vine is under maximum atmospheric demand; " "0.0 means no stress. The midday–afternoon peak is the primary " "window where SolarWine shading interventions are concentrated." ) _profile_df = _stress_profile.dropna().reset_index() _profile_df.columns = ["Hour (local)", "Stress Risk Score"] _peak_hr = int(_profile_df.loc[_profile_df["Stress Risk Score"].idxmax(), "Hour (local)"]) _fig_stress = px.bar( _profile_df, x="Hour (local)", y="Stress Risk Score", color="Stress Risk Score", color_continuous_scale=["#00BD3E", "#f5c518", "#e63946"], range_y=[0, 1], title=f"Hourly stress profile — peak at {_peak_hr:02d}:00 local", ) _fig_stress.add_hline( y=0.5, line_dash="dot", line_color="orange", annotation_text="Intervention threshold (0.5)", ) _fig_stress.update_layout( xaxis=dict(tickmode="linear", dtick=1), coloraxis_showscale=False, height=380, ) st.plotly_chart(_fig_stress) st.caption( f"Peak stress: {_peak_hr:02d}:00 local " f"(score = {_stress_profile.max():.3f}). " f"Low-stress morning window (before 10:00): " f"mean score = {_stress_profile.loc[6:9].mean():.3f} — shading withheld." ) # --------------------------------------------------------------------------- # --------------------------------------------------------------------------- # Tab — Shading Simulator (was Panel vs Open Sky) # ---------------------------------------------------------------------------