1. Data Cleaning and Transformation¶

In [ ]:
import os, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
In [ ]:
# === 0) Colab setup: mount Google Drive ===
from google.colab import drive
drive.mount('/content/drive')

# === 1) Paths
BASE_DIR = "/content/drive/My Drive/Research Project (W&M)/Salvinia_Weevil_Biocontrol_Project"
FILENAME = "salviniadata.xlsx"
XLSX_PATH = os.path.join(BASE_DIR, FILENAME)
Mounted at /content/drive
In [ ]:
# === 2) Read Excel (list sheets, load the main one)

# Show sheet names
xls = pd.ExcelFile(XLSX_PATH)
print("Sheets:", xls.sheet_names)

# Read the primary sheet
df = pd.read_excel(XLSX_PATH, sheet_name=xls.sheet_names[0])
Sheets: ['Sheet1', 'Sheet2', 'Sheet3']
In [ ]:
# === 3) Date parsing (Excel serial dates → pandas datetime)
# Convert Date column
if pd.api.types.is_numeric_dtype(df.get('Date', pd.Series(dtype=float))):
    df['Date'] = pd.to_datetime(df['Date'], unit='D', origin='1899-12-30')

# Enforce useful dtypes for numeric cols
numeric_cols = [
    'Mean # weevils', 'Mean dry wt', 'Total dry wt.', '% Salvinia Cover',
    'Mean blk', 'Mean brn', 'Mean % Salvinia Dam'
]
for c in numeric_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
In [ ]:
# === 4) Basic sanity checks for EDA
# Sort by date (per site if multiple sites exist)
sort_cols = ['Site', 'Date'] if 'Site' in df.columns else ['Date']
df = df.sort_values(sort_cols).reset_index(drop=True)

# Quick look at unique sites and date span
print("Sites:", df['Site'].unique() if 'Site' in df.columns else "N/A")
print("Date range:", df['Date'].min(), "→", df['Date'].max())

# Save a cleaned copy for later steps (SSR, GPR)
CLEAN_CSV = os.path.join(BASE_DIR, "salvinia_clean.csv")
df.to_csv(CLEAN_CSV, index=False)
print("Saved:", CLEAN_CSV)
Sites: ['Island' 'JaJa' 'Jabiluka' 'Minggung']
Date range: 1991-08-28 00:00:00 → 2004-02-21 00:00:00
Saved: /content/drive/My Drive/Research Project (W&M)/Salvinia_Weevil_Biocontrol_Project/salvinia_clean.csv

2. Exploratory Data Analysis¶

In [ ]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, matplotlib.dates as mdates

# -------- config --------
PREFERRED_SITES = ['Jabiluka', 'JaJa', 'Minggung', 'Island']
WINDOW = (pd.Timestamp('1991-01-01'), pd.Timestamp('2003-12-31'))
MAX_GAP_DAYS = 90

# -------- site-specific styles for BIOMASS (dots + lines) --------
SITE_STYLES = {
    'Jabiluka': dict(color='orange', linestyle='-', linewidth=1.5),
    'JaJa':     dict(color='green',  linestyle='-', linewidth=1.5),
    'Minggung': dict(color='black',  linestyle='-', linewidth=1.5),
    'Island':   dict(color='purple', linestyle='-', linewidth=1.5),
}

# -------- prep data (assumes df already loaded) --------
d = df.copy()

# Date -> datetime (robust for Excel serials)
if not pd.api.types.is_datetime64_any_dtype(d['Date']):
    d['Date'] = pd.to_datetime(pd.to_numeric(d['Date'], errors='coerce'),
                               unit='D', origin='1899-12-30')

# numeric biomass
d['Total dry wt.'] = pd.to_numeric(d['Total dry wt.'], errors='coerce')

# find a damage % column heuristically
cand = [c for c in d.columns
        if ('damage' in c.lower()) or ('salvinia dam' in c.lower()) or ('% salvinia dam' in c.lower())]
if not cand:
    raise ValueError("Couldn't find a weevil damage % column (e.g. 'Mean % Salvinia Dam').")
DAM_COL = cand[0]

# essentials + window
d = d[['Site','Date','Total dry wt.', DAM_COL]].dropna(subset=['Site','Date'])
d = d[(d['Date'] >= WINDOW[0]) & (d['Date'] <= WINDOW[1])].copy()

# transforms
d['y_biomass'] = np.log(d['Total dry wt.'].where(d['Total dry wt.'] > 0, np.nan))

# damage: % -> fraction (remove 0 or 1) -> logit
frac = pd.to_numeric(d[DAM_COL], errors='coerce') / 100.0
frac = frac.where((frac > 0) & (frac < 1), np.nan)  # remove 0 and 1
d['y_damage'] = np.log(frac / (1.0 - frac))

# global mean-centering
d['y_biomass'] -= d['y_biomass'].mean(skipna=True)
d['y_damage']  -= d['y_damage'].mean(skipna=True)

# -------- helper: draw lines that break across long gaps --------
def line_with_gaps(ax, dates, values, max_gap_days=MAX_GAP_DAYS, **plot_kwargs):
    df_tmp = pd.DataFrame({'Date': dates, 'y': values}).dropna().sort_values('Date')
    if df_tmp.empty:
        return
    # average same-day duplicates
    df_tmp = df_tmp.groupby('Date', as_index=False)['y'].mean()
    gap = df_tmp['Date'].diff().dt.days.fillna(0)
    run_id = (gap > max_gap_days).cumsum()
    for _, seg in df_tmp.groupby(run_id):
        if len(seg) >= 2:
            ax.plot(seg['Date'], seg['y'], zorder=3, **plot_kwargs)

# -------- plot (2×2) --------
sites = [s for s in PREFERRED_SITES if s in d['Site'].unique()]
if len(sites) < 4:
    sites += [s for s in d['Site'].unique() if s not in sites][:4 - len(sites)]

fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=True, sharey=True)
axes = axes.ravel()

for i, site in enumerate(sites):
    ax = axes[i]
    g = d[d['Site'] == site].copy().sort_values('Date')

    style = SITE_STYLES.get(site, dict(color='black', linestyle='-', linewidth=1.8))

    # raw dots
    ax.scatter(g['Date'], g['y_damage'],  s=22, color='0.7',   zorder=1)          # damage dots (grey)
    ax.scatter(g['Date'], g['y_biomass'], s=22, color=style['color'], zorder=2)   # biomass dots (site color)

    # lines
    line_with_gaps(ax, g['Date'], g['y_damage'],  color='0.7', linewidth=1.5, linestyle='-')   # damage line (grey)
    line_with_gaps(ax, g['Date'], g['y_biomass'], **style)                                     # biomass line (site style)

    ax.set_title(site, fontsize=12)
    ax.grid(True, alpha=0.25)

# hide unused panels if <4 sites
for j in range(len(sites), 4):
    fig.delaxes(axes[j])

# axis cosmetics
for ax in fig.axes:
    ax.xaxis.set_major_locator(mdates.YearLocator(1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
    for t in ax.get_xticklabels():
        t.set_rotation(90); t.set_va('center')

plt.xlim(WINDOW[0], WINDOW[1])
plt.ylim(-9, 7)
fig.suptitle(
    "Dots: raw; Lines: connect-within-gaps (no model)\n"
    "Biomass (site-colored, ln), Damage (grey, logit)",
    fontsize=16
)
plt.tight_layout(rect=[0,0,1,0.93])
plt.show()
No description has been provided for this image

3. Delay Embedding + GPR¶

In [ ]:
# === Takens delay embedding + GPR (manual Q and TAU) ===
# Uses ONLY ln(Total dry wt.) on a uniform time grid; no AMI/FNN.

import numpy as np, pandas as pd, matplotlib.pyplot as plt, matplotlib.dates as mdates
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.metrics import r2_score

# ---------------- CONFIG (edit these) ----------------
SITE = "Island"   # 'Jabiluka', 'JaJa', 'Minggung', 'Island'
Q    = 2          # embedding dimension (1, 2, 3, ...)
TAU  = 1          # delay (in resampled steps, e.g., 1..10)
FREQ = "30D"      # resample interval (uniform grid)
TEST_FRAC = 0.3   # chronological split fraction
H    = 1          # forecast horizon (steps ahead)
NOISE = 1e-4      # tiny GP noise for stability (set 0.0 for exact interpolation—may warn)

# ---------------- HELPERS ----------------
def ensure_datetime_num(df):
    d = df.copy()
    if not pd.api.types.is_datetime64_any_dtype(d['Date']):
        d['Date'] = pd.to_datetime(pd.to_numeric(d['Date'], errors='coerce'),
                                   unit='D', origin='1899-12-30')
    d['Total dry wt.'] = pd.to_numeric(d['Total dry wt.'], errors='coerce')
    return d

def series_ln_biomass_regular(df_site, freq=FREQ):
    """Return (dates, y) as uniform-grid ln(Total dry wt.), with time interpolation."""
    d = ensure_datetime_num(df_site).sort_values('Date').set_index('Date')
    y = np.log(d['Total dry wt.'].where(d['Total dry wt.'] > 0))   # ln; zeros -> NaN
    y = y.resample(freq).mean().interpolate('time')                # uniform grid
    return y.index.to_pydatetime(), y.values

def embed(x, q, tau):
    """Standard delay embedding: rows are [x_t, x_{t-τ}, ..., x_{t-(q-1)τ}]."""
    x = np.asarray(x, float)
    mask = np.isfinite(x); x = x[mask]
    n = len(x) - (q-1)*tau
    if n <= 0:
        return np.empty((0, q)), np.array([], dtype=int)
    X = np.column_stack([x[i:i+n] for i in range(0, q*tau, tau)])
    idx = np.arange((q-1)*tau, (q-1)*tau + n, dtype=int)  # indices aligned to current coord
    return X, idx

def train_predict_gpr(dates, y, q, tau, horizon=H, test_frac=TEST_FRAC, noise=NOISE):
    """Fit GPR for one-step (or H-step) forecasting from embedded state."""
    X, idx = embed(y, q, tau)
    if len(X) == 0:
        raise ValueError(f"Not enough data for Q={q}, TAU={tau}. Need (Q-1)*TAU < series length.")
    target_idx = idx + horizon
    valid = target_idx < len(y)
    X, idx, target_idx = X[valid], idx[valid], target_idx[valid]
    y_target = y[target_idx]
    dates_X  = np.array(dates)[idx]

    # chronological split
    split = int((1 - test_frac) * len(X))
    Xtr, Xte = X[:split], X[split:]
    ytr, yte = y_target[:split], y_target[split:]
    dtr, dte = dates_X[:split], dates_X[split:]

    # GP kernel (RBF + tiny white noise)
    length_guess = max(5.0, 2.0 * tau * q)  # heuristic in "steps"
    kernel = C(1.0, (1e-3, 1e3)) * RBF(length_guess, (1.0, 1e3)) + WhiteKernel(noise_level=noise)
    gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True,
                                  n_restarts_optimizer=3, random_state=0)
    gp.fit(Xtr, ytr)

    yhat_tr = gp.predict(Xtr); yhat_te = gp.predict(Xte)
    r2_tr = r2_score(ytr, yhat_tr) if len(ytr)>1 else np.nan
    r2_te = r2_score(yte, yhat_te) if len(yte)>1 else np.nan
    yhat_full = gp.predict(X)

    return {'gp': gp, 'dates_X': dates_X, 'y': y_target, 'yhat_full': yhat_full,
            'split': split, 'r2_tr': r2_tr, 'r2_te': r2_te}

# ---------------- RUN ----------------
print("Sites in df:", sorted(df['Site'].dropna().unique().tolist()))
site_df = df[df['Site'] == SITE].copy()
dates, y_meas = series_ln_biomass_regular(site_df, freq=FREQ)

# sanity check for embedding feasibility
eff_len = len(y_meas)
need = (Q-1)*TAU + 1
if need >= eff_len:
    raise ValueError(f"Series too short for Q={Q}, TAU={TAU} (need ≤ {(eff_len-1)//max(1,TAU)+1} effective q).")

print(f"Running Takens+GPR on {SITE} with Q={Q}, TAU={TAU}, horizon H={H}")
res = train_predict_gpr(dates, y_meas, q=Q, tau=TAU, horizon=H, test_frac=TEST_FRAC, noise=NOISE)
print(f"R² train = {res['r2_tr']:.3f} | R² test = {res['r2_te']:.3f}")
print("Fitted kernel:", res['gp'].kernel_)

# ---------------- PLOT ----------------
fig, ax = plt.subplots(figsize=(12,5))
ax.plot(dates, y_meas, color='0.7', label='ln(Total dry wt.) (uniform grid)')
ax.plot(res['dates_X'], res['yhat_full'], color='black', label=f'GPR one-step (Q={Q}, τ={TAU})')
if 0 < res['split'] < len(res['dates_X']):
    ax.axvline(res['dates_X'][res['split']], color='red', linestyle='--', alpha=0.4, label='train/test split')
ax.set_title(f"{SITE} — Takens (Q={Q}, τ={TAU}) + GPR (H={H})")
ax.set_xlabel("Year"); ax.set_ylabel("ln biomass")
ax.xaxis.set_major_locator(mdates.YearLocator(1)); ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
for t in ax.get_xticklabels(): t.set_rotation(90); t.set_va('center')
ax.grid(alpha=0.25); ax.legend(frameon=False); plt.tight_layout(); plt.show()
Sites in df: ['Island', 'JaJa', 'Jabiluka', 'Minggung']
Running Takens+GPR on Island with Q=2, TAU=1, horizon H=1
R² train = 0.789 | R² test = 0.698
Fitted kernel: 0.932**2 * RBF(length_scale=4.7) + WhiteKernel(noise_level=0.228)
No description has been provided for this image

4. Plot Actual vs Prediction¶

In [ ]:
# ==== side-by-side embeddings (Actual vs GPR Predicted) ====
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

def _embed(y, q, tau):
    y = np.asarray(y, float)
    y = y[np.isfinite(y)]
    n = len(y) - (q - 1) * tau
    if n <= 0:
        raise ValueError(f"Not enough samples for Q={q}, TAU={tau}")
    return np.column_stack([y[i:i+n] for i in range(0, q*tau, tau)])

def _target_aligned_pred(dates, y_series, q, tau, H, yhat_full):
    """
    Align ŷ to the *target* timestamps used in training (indices t+H).
    Returns (dates_pred, y_pred_clean).
    """
    y = np.asarray(y_series, float)
    y = y[np.isfinite(y)]
    n = len(y) - (q - 1) * tau
    if n <= 0:
        return np.array([]), np.array([])
    # indices for states and their targets
    idx = np.arange((q - 1) * tau, (q - 1) * tau + n, dtype=int)
    tgt = idx + H
    valid = tgt < len(y)
    idx, tgt = idx[valid], tgt[valid]

    # yhat_full is aligned to rows of X (i.e., idx after the same validity filter)
    yhat_full = np.asarray(yhat_full, float)
    L = min(len(yhat_full), len(idx))
    if L == 0:
        return np.array([]), np.array([])
    tgt = tgt[:L]
    yhat_full = yhat_full[:L]

    d_arr = np.array(dates)
    d_tgt = d_arr[tgt]
    good = np.isfinite(yhat_full)
    return d_tgt[good], yhat_full[good]

def _plot_single(ax, X, Q, TAU, title, show_time_color=True, show_path=True):
    tgrad = np.linspace(0, 1, len(X))
    if Q >= 3:
        x1, x2, x3 = X[:,0], X[:,1], X[:,2]
        sc = ax.scatter(x1, x2, x3, c=tgrad if show_time_color else None,
                        cmap='viridis' if show_time_color else None, s=14, alpha=0.9)
        if show_time_color:
            cb = plt.colorbar(sc, ax=ax, pad=0.1, shrink=0.8); cb.set_label("time →")
        if show_path: ax.plot(x1, x2, x3, color='0.6', linewidth=0.8, alpha=0.7)
        ax.set_xlabel("z(t)"); ax.set_ylabel(f"z(t−{TAU})"); ax.set_zlabel(f"z(t−{2*TAU})")
    else:
        x1, x2 = X[:,0], X[:,1]
        sc = ax.scatter(x1, x2, c=tgrad if show_time_color else None,
                        cmap='viridis' if show_time_color else None, s=18, alpha=0.95)
        if show_time_color:
            cb = plt.colorbar(sc, ax=ax); cb.set_label("time →")
        if show_path: ax.plot(x1, x2, color='0.6', linewidth=0.9, alpha=0.7)
        ax.set_xlabel("z(t)"); ax.set_ylabel(f"z(t−{TAU})"); ax.grid(alpha=0.25)
    ax.set_title(title)

def plot_state_space_side_by_side(dates, y_actual, res, Q, TAU, H=1,
                                  show_time_color=True, show_path=True):
    # Build actual embedding
    X_act = _embed(y_actual, Q, TAU)

    # Build target-aligned predicted series, then embed
    dates_pred, y_pred = _target_aligned_pred(dates, y_actual, Q, TAU, H, res['yhat_full'])
    if len(y_pred) == 0:
        raise ValueError("No predicted samples available after alignment.")
    X_pred = _embed(y_pred, Q, TAU)

    # Figure & axes
    if Q >= 3:
        fig = plt.figure(figsize=(14, 6))
        ax1 = fig.add_subplot(1, 2, 1, projection='3d')
        ax2 = fig.add_subplot(1, 2, 2, projection='3d')
    else:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    _plot_single(ax1, X_act,  Q, TAU, f"Actual (Q={Q}, τ={TAU})", show_time_color, show_path)
    _plot_single(ax2, X_pred, Q, TAU, f"Predicted (Q={Q}, τ={TAU})", show_time_color, show_path)
    plt.tight_layout(); plt.show()


# If “actual” series variable is y_meas (or y_ln), pass that here:
plot_state_space_side_by_side(
    dates=dates,
    y_actual=y_meas,   # or y_ln if that's what you use elsewhere
    res=res,           # from your GPR training step
    Q=Q,
    TAU=TAU,
    H=H                # typically 1 in your notebook
)
No description has been provided for this image