giovedì 2 aprile 2026

Pretrattamento iperspettrale

 ...ovvero tutto cio' che avrei voluto sapere in dottorato ma nessuno mi ha mai detto.

La domanda è: quanto conta il preprocessamento delle firme spettrali ai fini della successiva regressione? Tanto... di seguito un esempio.

Partiamo da questo database di firme spettrali (1350-2500 nm) ed associate caratteristiche chimico fisiche dei suoli 
 

Sanderman, J., Smith, C., Safanelli, J. L., Mitu, S. M., Ge, Y., Murad, O., & Shepherd, K. (2023). Near-infrared (NIR) soil spectral library using the NeoSpectra Handheld NIR Analyzer by Si-Ware (1.0) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7586622 

 

Dal file Neospectra_WoodwellKSSL_avg_soil+site+NIR.csv ho estrapolato solo la colonna EC ed i valori di riflettanza NIR e proviamo, senza pretrattamento, ad effettuare una regressione Random Forest (2969 spettri di dataset)

==============================
Model Performance (Target: ec)
R² Score: 0.2661
RMSE:     4.0410
==============================
 

import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# --- Settings ---
LABEL_FILE = 'label.csv'
SPECTRA_FOLDER = 'spectra'
ID_COL = 'id'  # The column in label.csv
TARGET_COL = 'ec'  # The lab value column

def load_data(label_file=LABEL_FILE, spectra_folder=SPECTRA_FOLDER,
              id_col=ID_COL, target_col=TARGET_COL):
    """Load reflectance spectra and the matching lab target values.

    Reads the label CSV, drops rows missing an id or target, then loads one
    spectrum file per sample ("<id>.csv" inside *spectra_folder*). Every
    spectrum is sorted by wavelength in descending order so each row of X
    lies on the same band order — TODO confirm all files share one grid.

    Parameters default to the module-level settings, so the original
    no-argument call ``load_data()`` keeps working.

    Returns:
        (X, y): X is a 2-D array (n_samples, n_bands) of reflectance values;
        y is a 1-D array of target values aligned with X's rows.
    """
    df_labels = pd.read_csv(label_file)
    df_labels = df_labels.dropna(subset=[id_col, target_col])
    # Normalize ids like 3748.0 -> "3748" so they match the file names.
    df_labels[id_col] = df_labels[id_col].astype(int).astype(str)

    X_list = []
    y_list = []
    print(f"Searching for spectra in: {os.path.abspath(spectra_folder)}")

    for _, row in df_labels.iterrows():
        sample_id = row[id_col]
        # Construct filename: e.g., '3748.csv'
        file_path = os.path.join(spectra_folder, f"{sample_id}.csv")
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        try:
            # Descending wavelength sort keeps all rows band-aligned.
            spec_df = pd.read_csv(file_path)
            spec_df = spec_df.sort_values('wavelength', ascending=False)
            X_list.append(spec_df['reflectance'].values)
            y_list.append(row[target_col])
        except (OSError, KeyError, ValueError) as e:
            # Skip unreadable or malformed files but keep loading the rest.
            print(f"Error reading {sample_id}.csv: {e}")

    return np.array(X_list), np.array(y_list)

# --- Execution ---
X, y = load_data()

if X.size == 0:
    print("\nNo data loaded. Check if the 'spectra' folder is in the same directory as this script.")
else:
    # Hold out 20% of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(n_estimators=500, max_features='sqrt', n_jobs=-1, random_state=42)
    print(f"\nTraining on {X_train.shape[0]} samples...")
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    score = r2_score(y_test, predictions)
    error = np.sqrt(mean_squared_error(y_test, predictions))

    banner = "=" * 30
    print("\n" + banner)
    print(f"Model Performance (Target: {TARGET_COL})")
    print(f"R² Score: {score:.4f}")
    print(f"RMSE: {error:.4f}")
    print(banner)


adesso stesso dataset ma con preprocessing: conversione da riflettanza ad assorbanza, SNV + derivata prima di Savitzky-Golay... a parità di input il modello è

Training RF with 1st Derivative + SNV...

==============================
Derivative Results
R² Score: 0.6482
RMSE:     2.7978
==============================

import pandas as pd
import numpy as np
import os
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# --- Settings ---
LABEL_FILE = 'label.csv'
SPECTRA_FOLDER = 'spectra'
ID_COL = 'id'
TARGET_COL = 'ec'

def load_and_preprocess_advanced(label_file=LABEL_FILE, spectra_folder=SPECTRA_FOLDER,
                                 window_length=11, polyorder=2):
    """Load spectra and apply Absorbance -> SG 1st derivative -> SNV.

    Reads the label CSV, loads "<id>.csv" for every labelled sample
    (missing files are silently skipped), converts reflectance to
    absorbance, then applies a Savitzky-Golay first derivative followed
    by row-wise SNV scaling.

    Args:
        label_file: CSV with the id and target columns (default: module setting).
        spectra_folder: directory holding one "<id>.csv" per sample.
        window_length: SG window; must be odd, 11-15 is usually good for NIR.
        polyorder: SG polynomial order, usually 2.

    Returns:
        (X, y): X is (n_samples, n_bands) preprocessed spectra; y the targets.
        Both are empty arrays when no spectrum file matched.
    """
    df_labels = pd.read_csv(label_file).dropna(subset=[ID_COL, TARGET_COL])
    # Normalize ids like 3748.0 -> "3748" so they match the file names.
    df_labels[ID_COL] = df_labels[ID_COL].astype(int).astype(str)

    X_list, y_list = [], []
    for _, row in df_labels.iterrows():
        file_path = os.path.join(spectra_folder, f"{row[ID_COL]}.csv")
        if not os.path.exists(file_path):
            continue
        spec_df = pd.read_csv(file_path).sort_values('wavelength', ascending=False)
        # 1. Absorbance: clip keeps log10 defined for zero/negative reflectance.
        ref = np.clip(spec_df['reflectance'].values, 0.01, 100)
        X_list.append(np.log10(100.0 / ref))
        y_list.append(row[TARGET_COL])

    if not X_list:
        # No spectrum found: return empty output instead of letting
        # savgol_filter fail on a zero-length array.
        return np.empty((0, 0)), np.array([])

    X = np.array(X_list)
    # 2. Savitzky-Golay 1st Derivative.
    X_deriv = savgol_filter(X, window_length=window_length, polyorder=polyorder, deriv=1)
    # 3. SNV (applied to the derivative); guard zero-variance rows
    # against a divide-by-zero producing NaN/inf.
    std = np.std(X_deriv, axis=1, keepdims=True)
    std[std == 0] = 1.0
    X_final = (X_deriv - np.mean(X_deriv, axis=1, keepdims=True)) / std
    return X_final, np.array(y_list)

# --- Execution ---
X, y = load_and_preprocess_advanced()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Same RF configuration as the raw-reflectance baseline run.
rf = RandomForestRegressor(n_estimators=500, max_features='sqrt', n_jobs=-1, random_state=42)

print("Training RF with 1st Derivative + SNV...")
rf.fit(X_train, y_train)

preds = rf.predict(X_test)

banner = "=" * 30
print("\n" + banner)
print(f"Derivative Results")
print(f"R² Score: {r2_score(y_test, preds):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds)):.4f}")
print(banner)


Facciamo ancora un ultimo passo applicando la PCA (analisi delle componenti principali) al preprocessing

Training RF su componenti PCA...

==============================
PCA + Random Forest Results
R² Score: 0.7491
RMSE:     2.3630
==============================
 

import pandas as pd
import numpy as np
import os
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# --- Settings ---
LABEL_FILE = 'label.csv'
SPECTRA_FOLDER = 'spectra'
ID_COL = 'id'
TARGET_COL = 'ec'

def load_and_preprocess_advanced(label_file=LABEL_FILE, spectra_folder=SPECTRA_FOLDER,
                                 window_length=11, polyorder=2):
    """Load spectra and apply Absorbance -> SG 1st derivative -> SNV.

    Same preprocessing chain as the previous script: reflectance to
    absorbance, Savitzky-Golay first derivative, then row-wise SNV.
    Missing spectrum files are silently skipped.

    Args:
        label_file: CSV with the id and target columns (default: module setting).
        spectra_folder: directory holding one "<id>.csv" per sample.
        window_length: SG window; must be odd.
        polyorder: SG polynomial order.

    Returns:
        (X, y): preprocessed spectra matrix and aligned targets; both empty
        arrays when no spectrum file matched.
    """
    df_labels = pd.read_csv(label_file).dropna(subset=[ID_COL, TARGET_COL])
    # Normalize ids like 3748.0 -> "3748" so they match the file names.
    df_labels[ID_COL] = df_labels[ID_COL].astype(int).astype(str)

    X_list, y_list = [], []
    for _, row in df_labels.iterrows():
        file_path = os.path.join(spectra_folder, f"{row[ID_COL]}.csv")
        if not os.path.exists(file_path):
            continue
        spec_df = pd.read_csv(file_path).sort_values('wavelength', ascending=False)
        # Clip keeps log10 defined for zero/negative reflectance values.
        ref = np.clip(spec_df['reflectance'].values, 0.01, 100)
        X_list.append(np.log10(100.0 / ref))
        y_list.append(row[TARGET_COL])

    if not X_list:
        # No spectrum found: return empty output instead of letting
        # savgol_filter fail on a zero-length array.
        return np.empty((0, 0)), np.array([])

    X = np.array(X_list)
    # Classic preprocessing (derivative + SNV).
    X_deriv = savgol_filter(X, window_length=window_length, polyorder=polyorder, deriv=1)
    # Guard zero-variance rows against a divide-by-zero producing NaN/inf.
    std = np.std(X_deriv, axis=1, keepdims=True)
    std[std == 0] = 1.0
    X_snv = (X_deriv - np.mean(X_deriv, axis=1, keepdims=True)) / std
    return X_snv, np.array(y_list)

# --- 1. Load data ---
X, y = load_and_preprocess_advanced()
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 2. PCA (Principal Component Analysis) ---
# n_components=0.99 keeps enough components to explain 99% of the variance.
pca = PCA(n_components=0.99, random_state=42)

# Crucial: fit the PCA on the training set only, then project the
# test set with the same basis to avoid information leakage.
X_train_pca = pca.fit_transform(X_train_raw)
X_test_pca = pca.transform(X_test_raw)

print(f"Numero di Componenti Principali selezionate: {pca.n_components_}")
print(f"Varianza totale spiegata: {np.sum(pca.explained_variance_ratio_):.4f}")

# --- 3. Random Forest on the principal components ---
# With far fewer features, every component can be considered at each split.
rf = RandomForestRegressor(n_estimators=500, max_features=None, n_jobs=-1, random_state=42)

print("Training RF su componenti PCA...")
rf.fit(X_train_pca, y_train)

# --- 4. Results ---
y_pred = rf.predict(X_test_pca)

banner = "=" * 30
print("\n" + banner)
print(f"PCA + Random Forest Results")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(banner)

 

in sintesi siamo passati da R²=0.27 a R²=0.75 con il solo preprocessing

Messo a posto il discorso preprocessing vediamo cosa succede a cambiare modello. Applicando PLSR si ha

Best number of components: 25 (CV R²: 0.2882)

==============================
Final PLS Model Results (n=25)
R² Score: 0.2732
RMSE:     4.0215
==============================
 

il drastico peggioramento dovrebbe essere dovuto alla non linearità della risposta spettrale rispetto al parametro analizzato (la PLSR si basa su un'ipotesi di linearità)

facciamo un ultimo tentativo con il Gradient Boosting

Riduzione dimensionalità: da 257 a 41 componenti PCA.
Training GBR su componenti PCA...

==============================
GBR + PCA Test Results:
R² Score: 0.7200
RMSE:     2.4963
==============================
 

import pandas as pd
import numpy as np
import os
from scipy.signal import savgol_filter
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# --- Settings ---
TRAIN_LABELS = 'label.csv'
TRAIN_DIR = 'spectra'
VAL_LABELS = 'valida.csv'
VAL_DIR = 'new_spectra'
ID_COL = 'id'
TARGET_COL = 'ec'

def preprocess_spectra(folder, id_list, window_length=11, polyorder=2):
    """Absorbance -> SG Derivative -> SNV.

    Loads "<id>.csv" from *folder* for every id in *id_list* (ids missing a
    file are skipped) and applies the standard chain: reflectance to
    absorbance, Savitzky-Golay 1st derivative, row-wise SNV.

    Args:
        folder: directory holding one "<id>.csv" per sample.
        id_list: iterable of sample ids (numeric or string).
        window_length: SG window; must be odd.
        polyorder: SG polynomial order.

    Returns:
        (X, processed_ids): X is (n_found, n_bands); processed_ids holds the
        original ids, in input order, whose file was found. When nothing
        matched, X is an empty array and processed_ids is an empty list —
        previously this case crashed inside savgol_filter before the
        caller's ``X_val.size > 0`` guard could run.
    """
    features = []
    processed_ids = []
    for sid in id_list:
        # Convert ID to string to avoid .0 issues (e.g. 3748.0 -> "3748").
        clean_id = str(int(float(sid)))
        path = os.path.join(folder, f"{clean_id}.csv")
        if not os.path.exists(path):
            continue
        spec_df = pd.read_csv(path).sort_values('wavelength', ascending=False)
        # 1. Absorbance: clip keeps log10 defined for zero/negative reflectance.
        ref = np.clip(spec_df['reflectance'].values, 0.01, 100)
        features.append(np.log10(100.0 / ref))
        processed_ids.append(sid)

    if not features:
        # Empty-input fix: return an empty matrix instead of feeding a
        # zero-length array to savgol_filter (which raises).
        return np.empty((0, 0)), processed_ids

    X = np.array(features)
    # 2. 1st Derivative (smooths and highlights peaks).
    X = savgol_filter(X, window_length=window_length, polyorder=polyorder, deriv=1)
    # 3. SNV (standardizes the scale); guard zero-variance rows.
    std = np.std(X, axis=1, keepdims=True)
    std[std == 0] = 1.0
    X = (X - np.mean(X, axis=1, keepdims=True)) / std
    return X, processed_ids

# --- 1. Data Preparation ---
print("Loading and preprocessing training data...")
df_labels = pd.read_csv(TRAIN_LABELS).dropna(subset=[ID_COL, TARGET_COL])
X_train_full, train_ids = preprocess_spectra(TRAIN_DIR, df_labels[ID_COL].values)

# Keep only the targets whose spectrum file was actually found.
y_train_full = df_labels.set_index(ID_COL).loc[train_ids][TARGET_COL].values

# Hold out 20% of the samples for internal testing.
X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# --- 2. Gradient Boosting Model ---
# A smaller learning_rate generally helps but needs more estimators;
# shallow trees (depth 3-5) keep boosting from overfitting.
gbr = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,  # stochastic boosting: 80% of the rows per tree
    random_state=42,
    verbose=1,
)

print("\nTraining Gradient Boosting Regressor...")
gbr.fit(X_train, y_train)

# Internal evaluation on the held-out split.
internal_preds = gbr.predict(X_test)
banner = "=" * 30
print("\n" + banner)
print(f"GBR Internal Test Results:")
print(f"R² Score: {r2_score(y_test, internal_preds):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, internal_preds)):.4f}")
print(banner)

# --- 3. Inference on Validation Set ---
print(f"\nProcessing inference for {VAL_DIR}...")
df_valida = pd.read_csv(VAL_LABELS)
X_val, val_ids = preprocess_spectra(VAL_DIR, df_valida[ID_COL].values)

if X_val.size > 0:
    val_predictions = gbr.predict(X_val)
    # Persist the predictions alongside their sample ids.
    results = pd.DataFrame({ID_COL: val_ids, 'predicted_ec': val_predictions})
    results.to_csv('valida_predictions_gbr.csv', index=False)
    print("Inference results saved to 'valida_predictions_gbr.csv'")

# Quick look at which input features drive the prediction.
import matplotlib.pyplot as plt
importances = gbr.feature_importances_
plt.plot(importances)
plt.title("Wavelength Importance for EC")
plt.show()


 

 

 

 

 

Nessun commento:

Posta un commento

Hypersoil su campioni Mugello

Durante la tesi di dottorato avevo provato a determinare il contenuto di CaCO3 in suoli naturali basandomi solo sul picco di assorbimento ca...