1. Data Cleaning and Transformation¶
In [ ]:
import os, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
In [ ]:
# === 0) Colab setup: mount Google Drive ===
from google.colab import drive
drive.mount('/content/drive')
# === 1) Paths
BASE_DIR = "/content/drive/My Drive/Research Project (W&M)/Salvinia_Weevil_Biocontrol_Project"
FILENAME = "salviniadata.xlsx"
XLSX_PATH = os.path.join(BASE_DIR, FILENAME)
Mounted at /content/drive
In [ ]:
# === 2) Read Excel (list sheets, load the main one)
# Show sheet names
xls = pd.ExcelFile(XLSX_PATH)
print("Sheets:", xls.sheet_names)
# Read the primary sheet
df = pd.read_excel(XLSX_PATH, sheet_name=xls.sheet_names[0])
Sheets: ['Sheet1', 'Sheet2', 'Sheet3']
In [ ]:
# === 3) Date parsing (Excel serial dates → pandas datetime)
# Convert Date column
if pd.api.types.is_numeric_dtype(df.get('Date', pd.Series(dtype=float))):
df['Date'] = pd.to_datetime(df['Date'], unit='D', origin='1899-12-30')
# Enforce useful dtypes for numeric cols
numeric_cols = [
'Mean # weevils', 'Mean dry wt', 'Total dry wt.', '% Salvinia Cover',
'Mean blk', 'Mean brn', 'Mean % Salvinia Dam'
]
for c in numeric_cols:
if c in df.columns:
df[c] = pd.to_numeric(df[c], errors='coerce')
In [ ]:
# === 4) Basic sanity checks for EDA
# Sort by date (per site if multiple sites exist)
sort_cols = ['Site', 'Date'] if 'Site' in df.columns else ['Date']
df = df.sort_values(sort_cols).reset_index(drop=True)
# Quick look at unique sites and date span
print("Sites:", df['Site'].unique() if 'Site' in df.columns else "N/A")
print("Date range:", df['Date'].min(), "→", df['Date'].max())
# Save a cleaned copy for later steps (SSR, GPR)
CLEAN_CSV = os.path.join(BASE_DIR, "salvinia_clean.csv")
df.to_csv(CLEAN_CSV, index=False)
print("Saved:", CLEAN_CSV)
Sites: ['Island' 'JaJa' 'Jabiluka' 'Minggung'] Date range: 1991-08-28 00:00:00 → 2004-02-21 00:00:00 Saved: /content/drive/My Drive/Research Project (W&M)/Salvinia_Weevil_Biocontrol_Project/salvinia_clean.csv
2. Exploratory Data Analysis¶
In [ ]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, matplotlib.dates as mdates
# -------- config --------
PREFERRED_SITES = ['Jabiluka', 'JaJa', 'Minggung', 'Island']
WINDOW = (pd.Timestamp('1991-01-01'), pd.Timestamp('2003-12-31'))
MAX_GAP_DAYS = 90
# -------- site-specific styles for BIOMASS (dots + lines) --------
SITE_STYLES = {
'Jabiluka': dict(color='orange', linestyle='-', linewidth=1.5),
'JaJa': dict(color='green', linestyle='-', linewidth=1.5),
'Minggung': dict(color='black', linestyle='-', linewidth=1.5),
'Island': dict(color='purple', linestyle='-', linewidth=1.5),
}
# -------- prep data (assumes df already loaded) --------
d = df.copy()
# Date -> datetime (robust for Excel serials)
if not pd.api.types.is_datetime64_any_dtype(d['Date']):
d['Date'] = pd.to_datetime(pd.to_numeric(d['Date'], errors='coerce'),
unit='D', origin='1899-12-30')
# numeric biomass
d['Total dry wt.'] = pd.to_numeric(d['Total dry wt.'], errors='coerce')
# find a damage % column heuristically
cand = [c for c in d.columns
if ('damage' in c.lower()) or ('salvinia dam' in c.lower()) or ('% salvinia dam' in c.lower())]
if not cand:
raise ValueError("Couldn't find a weevil damage % column (e.g. 'Mean % Salvinia Dam').")
DAM_COL = cand[0]
# essentials + window
d = d[['Site','Date','Total dry wt.', DAM_COL]].dropna(subset=['Site','Date'])
d = d[(d['Date'] >= WINDOW[0]) & (d['Date'] <= WINDOW[1])].copy()
# transforms
d['y_biomass'] = np.log(d['Total dry wt.'].where(d['Total dry wt.'] > 0, np.nan))
# damage: % -> fraction (remove 0 or 1) -> logit
frac = pd.to_numeric(d[DAM_COL], errors='coerce') / 100.0
frac = frac.where((frac > 0) & (frac < 1), np.nan) # remove 0 and 1
d['y_damage'] = np.log(frac / (1.0 - frac))
# global mean-centering
d['y_biomass'] -= d['y_biomass'].mean(skipna=True)
d['y_damage'] -= d['y_damage'].mean(skipna=True)
# -------- helper: draw lines that break across long gaps --------
def line_with_gaps(ax, dates, values, max_gap_days=MAX_GAP_DAYS, **plot_kwargs):
df_tmp = pd.DataFrame({'Date': dates, 'y': values}).dropna().sort_values('Date')
if df_tmp.empty:
return
# average same-day duplicates
df_tmp = df_tmp.groupby('Date', as_index=False)['y'].mean()
gap = df_tmp['Date'].diff().dt.days.fillna(0)
run_id = (gap > max_gap_days).cumsum()
for _, seg in df_tmp.groupby(run_id):
if len(seg) >= 2:
ax.plot(seg['Date'], seg['y'], zorder=3, **plot_kwargs)
# -------- plot (2×2) --------
sites = [s for s in PREFERRED_SITES if s in d['Site'].unique()]
if len(sites) < 4:
sites += [s for s in d['Site'].unique() if s not in sites][:4 - len(sites)]
fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=True, sharey=True)
axes = axes.ravel()
for i, site in enumerate(sites):
ax = axes[i]
g = d[d['Site'] == site].copy().sort_values('Date')
style = SITE_STYLES.get(site, dict(color='black', linestyle='-', linewidth=1.8))
# raw dots
ax.scatter(g['Date'], g['y_damage'], s=22, color='0.7', zorder=1) # damage dots (grey)
ax.scatter(g['Date'], g['y_biomass'], s=22, color=style['color'], zorder=2) # biomass dots (site color)
# lines
line_with_gaps(ax, g['Date'], g['y_damage'], color='0.7', linewidth=1.5, linestyle='-') # damage line (grey)
line_with_gaps(ax, g['Date'], g['y_biomass'], **style) # biomass line (site style)
ax.set_title(site, fontsize=12)
ax.grid(True, alpha=0.25)
# hide unused panels if <4 sites
for j in range(len(sites), 4):
fig.delaxes(axes[j])
# axis cosmetics
for ax in fig.axes:
ax.xaxis.set_major_locator(mdates.YearLocator(1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
for t in ax.get_xticklabels():
t.set_rotation(90); t.set_va('center')
plt.xlim(WINDOW[0], WINDOW[1])
plt.ylim(-9, 7)
fig.suptitle(
"Dots: raw; Lines: connect-within-gaps (no model)\n"
"Biomass (site-colored, ln), Damage (grey, logit)",
fontsize=16
)
plt.tight_layout(rect=[0,0,1,0.93])
plt.show()
3. Delay Embedding + GPR¶
In [ ]:
# === Takens delay embedding + GPR (manual Q and TAU) ===
# Uses ONLY ln(Total dry wt.) on a uniform time grid; no AMI/FNN.
import numpy as np, pandas as pd, matplotlib.pyplot as plt, matplotlib.dates as mdates
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.metrics import r2_score
# ---------------- CONFIG (edit these) ----------------
SITE = "Island" # 'Jabiluka', 'JaJa', 'Minggung', 'Island'
Q = 2 # embedding dimension (1, 2, 3, ...)
TAU = 1 # delay (in resampled steps, e.g., 1..10)
FREQ = "30D" # resample interval (uniform grid)
TEST_FRAC = 0.3 # chronological split fraction
H = 1 # forecast horizon (steps ahead)
NOISE = 1e-4 # tiny GP noise for stability (set 0.0 for exact interpolation—may warn)
# ---------------- HELPERS ----------------
def ensure_datetime_num(df):
d = df.copy()
if not pd.api.types.is_datetime64_any_dtype(d['Date']):
d['Date'] = pd.to_datetime(pd.to_numeric(d['Date'], errors='coerce'),
unit='D', origin='1899-12-30')
d['Total dry wt.'] = pd.to_numeric(d['Total dry wt.'], errors='coerce')
return d
def series_ln_biomass_regular(df_site, freq=FREQ):
"""Return (dates, y) as uniform-grid ln(Total dry wt.), with time interpolation."""
d = ensure_datetime_num(df_site).sort_values('Date').set_index('Date')
y = np.log(d['Total dry wt.'].where(d['Total dry wt.'] > 0)) # ln; zeros -> NaN
y = y.resample(freq).mean().interpolate('time') # uniform grid
return y.index.to_pydatetime(), y.values
def embed(x, q, tau):
"""Standard delay embedding: rows are [x_t, x_{t-τ}, ..., x_{t-(q-1)τ}]."""
x = np.asarray(x, float)
mask = np.isfinite(x); x = x[mask]
n = len(x) - (q-1)*tau
if n <= 0:
return np.empty((0, q)), np.array([], dtype=int)
X = np.column_stack([x[i:i+n] for i in range(0, q*tau, tau)])
idx = np.arange((q-1)*tau, (q-1)*tau + n, dtype=int) # indices aligned to current coord
return X, idx
def train_predict_gpr(dates, y, q, tau, horizon=H, test_frac=TEST_FRAC, noise=NOISE):
"""Fit GPR for one-step (or H-step) forecasting from embedded state."""
X, idx = embed(y, q, tau)
if len(X) == 0:
raise ValueError(f"Not enough data for Q={q}, TAU={tau}. Need (Q-1)*TAU < series length.")
target_idx = idx + horizon
valid = target_idx < len(y)
X, idx, target_idx = X[valid], idx[valid], target_idx[valid]
y_target = y[target_idx]
dates_X = np.array(dates)[idx]
# chronological split
split = int((1 - test_frac) * len(X))
Xtr, Xte = X[:split], X[split:]
ytr, yte = y_target[:split], y_target[split:]
dtr, dte = dates_X[:split], dates_X[split:]
# GP kernel (RBF + tiny white noise)
length_guess = max(5.0, 2.0 * tau * q) # heuristic in "steps"
kernel = C(1.0, (1e-3, 1e3)) * RBF(length_guess, (1.0, 1e3)) + WhiteKernel(noise_level=noise)
gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True,
n_restarts_optimizer=3, random_state=0)
gp.fit(Xtr, ytr)
yhat_tr = gp.predict(Xtr); yhat_te = gp.predict(Xte)
r2_tr = r2_score(ytr, yhat_tr) if len(ytr)>1 else np.nan
r2_te = r2_score(yte, yhat_te) if len(yte)>1 else np.nan
yhat_full = gp.predict(X)
return {'gp': gp, 'dates_X': dates_X, 'y': y_target, 'yhat_full': yhat_full,
'split': split, 'r2_tr': r2_tr, 'r2_te': r2_te}
# ---------------- RUN ----------------
print("Sites in df:", sorted(df['Site'].dropna().unique().tolist()))
site_df = df[df['Site'] == SITE].copy()
dates, y_meas = series_ln_biomass_regular(site_df, freq=FREQ)
# sanity check for embedding feasibility
eff_len = len(y_meas)
need = (Q-1)*TAU + 1
if need >= eff_len:
raise ValueError(f"Series too short for Q={Q}, TAU={TAU} (need ≤ {(eff_len-1)//max(1,TAU)+1} effective q).")
print(f"Running Takens+GPR on {SITE} with Q={Q}, TAU={TAU}, horizon H={H}")
res = train_predict_gpr(dates, y_meas, q=Q, tau=TAU, horizon=H, test_frac=TEST_FRAC, noise=NOISE)
print(f"R² train = {res['r2_tr']:.3f} | R² test = {res['r2_te']:.3f}")
print("Fitted kernel:", res['gp'].kernel_)
# ---------------- PLOT ----------------
fig, ax = plt.subplots(figsize=(12,5))
ax.plot(dates, y_meas, color='0.7', label='ln(Total dry wt.) (uniform grid)')
ax.plot(res['dates_X'], res['yhat_full'], color='black', label=f'GPR one-step (Q={Q}, τ={TAU})')
if 0 < res['split'] < len(res['dates_X']):
ax.axvline(res['dates_X'][res['split']], color='red', linestyle='--', alpha=0.4, label='train/test split')
ax.set_title(f"{SITE} — Takens (Q={Q}, τ={TAU}) + GPR (H={H})")
ax.set_xlabel("Year"); ax.set_ylabel("ln biomass")
ax.xaxis.set_major_locator(mdates.YearLocator(1)); ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
for t in ax.get_xticklabels(): t.set_rotation(90); t.set_va('center')
ax.grid(alpha=0.25); ax.legend(frameon=False); plt.tight_layout(); plt.show()
Sites in df: ['Island', 'JaJa', 'Jabiluka', 'Minggung'] Running Takens+GPR on Island with Q=2, TAU=1, horizon H=1 R² train = 0.789 | R² test = 0.698 Fitted kernel: 0.932**2 * RBF(length_scale=4.7) + WhiteKernel(noise_level=0.228)
4. Plot Actual vs Prediction¶
In [ ]:
# ==== side-by-side embeddings (Actual vs GPR Predicted) ====
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # noqa: F401
def _embed(y, q, tau):
y = np.asarray(y, float)
y = y[np.isfinite(y)]
n = len(y) - (q - 1) * tau
if n <= 0:
raise ValueError(f"Not enough samples for Q={q}, TAU={tau}")
return np.column_stack([y[i:i+n] for i in range(0, q*tau, tau)])
def _target_aligned_pred(dates, y_series, q, tau, H, yhat_full):
"""
Align ŷ to the *target* timestamps used in training (indices t+H).
Returns (dates_pred, y_pred_clean).
"""
y = np.asarray(y_series, float)
y = y[np.isfinite(y)]
n = len(y) - (q - 1) * tau
if n <= 0:
return np.array([]), np.array([])
# indices for states and their targets
idx = np.arange((q - 1) * tau, (q - 1) * tau + n, dtype=int)
tgt = idx + H
valid = tgt < len(y)
idx, tgt = idx[valid], tgt[valid]
# yhat_full is aligned to rows of X (i.e., idx after the same validity filter)
yhat_full = np.asarray(yhat_full, float)
L = min(len(yhat_full), len(idx))
if L == 0:
return np.array([]), np.array([])
tgt = tgt[:L]
yhat_full = yhat_full[:L]
d_arr = np.array(dates)
d_tgt = d_arr[tgt]
good = np.isfinite(yhat_full)
return d_tgt[good], yhat_full[good]
def _plot_single(ax, X, Q, TAU, title, show_time_color=True, show_path=True):
tgrad = np.linspace(0, 1, len(X))
if Q >= 3:
x1, x2, x3 = X[:,0], X[:,1], X[:,2]
sc = ax.scatter(x1, x2, x3, c=tgrad if show_time_color else None,
cmap='viridis' if show_time_color else None, s=14, alpha=0.9)
if show_time_color:
cb = plt.colorbar(sc, ax=ax, pad=0.1, shrink=0.8); cb.set_label("time →")
if show_path: ax.plot(x1, x2, x3, color='0.6', linewidth=0.8, alpha=0.7)
ax.set_xlabel("z(t)"); ax.set_ylabel(f"z(t−{TAU})"); ax.set_zlabel(f"z(t−{2*TAU})")
else:
x1, x2 = X[:,0], X[:,1]
sc = ax.scatter(x1, x2, c=tgrad if show_time_color else None,
cmap='viridis' if show_time_color else None, s=18, alpha=0.95)
if show_time_color:
cb = plt.colorbar(sc, ax=ax); cb.set_label("time →")
if show_path: ax.plot(x1, x2, color='0.6', linewidth=0.9, alpha=0.7)
ax.set_xlabel("z(t)"); ax.set_ylabel(f"z(t−{TAU})"); ax.grid(alpha=0.25)
ax.set_title(title)
def plot_state_space_side_by_side(dates, y_actual, res, Q, TAU, H=1,
show_time_color=True, show_path=True):
# Build actual embedding
X_act = _embed(y_actual, Q, TAU)
# Build target-aligned predicted series, then embed
dates_pred, y_pred = _target_aligned_pred(dates, y_actual, Q, TAU, H, res['yhat_full'])
if len(y_pred) == 0:
raise ValueError("No predicted samples available after alignment.")
X_pred = _embed(y_pred, Q, TAU)
# Figure & axes
if Q >= 3:
fig = plt.figure(figsize=(14, 6))
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
ax2 = fig.add_subplot(1, 2, 2, projection='3d')
else:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
_plot_single(ax1, X_act, Q, TAU, f"Actual (Q={Q}, τ={TAU})", show_time_color, show_path)
_plot_single(ax2, X_pred, Q, TAU, f"Predicted (Q={Q}, τ={TAU})", show_time_color, show_path)
plt.tight_layout(); plt.show()
# If “actual” series variable is y_meas (or y_ln), pass that here:
plot_state_space_side_by_side(
dates=dates,
y_actual=y_meas, # or y_ln if that's what you use elsewhere
res=res, # from your GPR training step
Q=Q,
TAU=TAU,
H=H # typically 1 in your notebook
)