Durante il dottorato avevo fatto qualche prova con campioni artificiali di terreno ad umidita' controllata ed avevo un'ottima correlazione con la misura iperspettrale
I campioni erano di terreno naturale con umidita' aggiunta artificialmente
 |
| Campione di tesi dottorato - illuminazione solare |
Dopo la rimozione del continuum avevo lavorato sugli assorbimenti nello swir
Adesso ho trovato questo lavoro
Felix M. Riese and Sina Keller, “Introducing a Framework of
Self-Organizing Maps for Regression of Soil Moisture with Hyperspectral
Data,” in IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium, Valencia, Spain, 2018, pp. 6151-6154.
i dati sono riportati a questo link in formato csv con 680 spettri VNIR di campagna e relative misure di umidita' realizzate in sito tramite TRIME-PICO time-domain reflectometry (TDR)
questi sono i dati normalizzati con rimozione del continuo
In VNIR non risulta immediato un approccio di misura della profondita' di picco ed ho provato a vedere come si comportava un approccio random forest (ho utilizzato gemini in Google Colab in modo da avere anche l'ottimizzazione dei parametri in modo semplice)
import pandas as pd

# Read the field hyperspectral soil-moisture dataset (Riese & Keller, IGARSS 2018).
df = pd.read_csv('/content/soilmoisture_dataset.csv')
display(df.head())

# The target (soil moisture) lives in the third column; the spectral bands
# start at the 19th column and run to the end of the frame.
Y = df.iloc[:, 2]
X = df.iloc[:, 18:]

# Keep only complete observations: join features and target, drop every row
# with a NaN anywhere, then split them apart again (target is the last column).
complete = pd.concat([X, Y], axis=1).dropna()
X = complete.iloc[:, :-1]
Y = complete.iloc[:, -1]

print(f"Shape of X: {X.shape}")
print(f"Shape of Y: {Y.shape}")
display(X.head())
display(Y.head())
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Baseline forest: 100 trees, otherwise default hyper-parameters.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

# Score the held-out split.
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hyper-parameter search space for the forest.
param_dist = {
    'n_estimators': np.arange(100, 1000, 100),        # forest size
    'max_features': ['sqrt', 'log2'],                 # features tried per split
    'max_depth': [None, 10, 20, 30, 40, 50],          # tree depth cap
    'min_samples_split': np.arange(2, 20, 2),         # min samples to split a node
    'min_samples_leaf': np.arange(1, 10, 1),          # min samples per leaf
    'bootstrap': [True, False],                       # sample with/without replacement
}

rf = RandomForestRegressor(random_state=42)

# Draw 50 random configurations; each is scored with 5-fold CV on MSE.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
random_search.fit(X_train, Y_train)

print("Best Parameters found:", random_search.best_params_)
print("Best Negative Mean Squared Error (from CV):", random_search.best_score_)

# Re-evaluate the winning estimator on the untouched test split.
best_rf_model = random_search.best_estimator_
Y_pred_tuned = best_rf_model.predict(X_test)
mse_tuned = mean_squared_error(Y_test, Y_pred_tuned)
r2_tuned = r2_score(Y_test, Y_pred_tuned)
print(f"\nPerformance of the tuned Random Forest model on the test set:")
print(f"Mean Squared Error (tuned): {mse_tuned:.4f}")
print(f"R-squared (tuned): {r2_tuned:.4f}")
import matplotlib.pyplot as plt
import seaborn as sns

# Measured vs. predicted moisture for the baseline model, with a 1:1 line.
plt.figure(figsize=(10, 7))
sns.scatterplot(x=Y_test, y=Y_pred, alpha=0.6)
lo, hi = Y.min(), Y.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction Line')
plt.title('Actual vs. Predicted Soil Moisture Values (Test Set)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Same scatter as above, now for the tuned model's predictions.
plt.figure(figsize=(10, 7))
sns.scatterplot(x=Y_test, y=Y_pred_tuned, alpha=0.6)
lo, hi = Y.min(), Y.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction Line')
plt.title('Actual vs. Predicted Soil Moisture Values (Tuned Model - Test Set)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): this cell is an exact repeat of the previous tuned-model
# scatter (likely a duplicated Colab cell); kept so the post's figure
# sequence is unchanged.
plt.figure(figsize=(10, 7))
sns.scatterplot(x=Y_test, y=Y_pred_tuned, alpha=0.6)
lo, hi = Y.min(), Y.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction Line')
plt.title('Actual vs. Predicted Soil Moisture Values (Tuned Model - Test Set)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Baseline-model scatter again, retitled to contrast with the tuned figure.
plt.figure(figsize=(10, 7))
sns.scatterplot(x=Y_test, y=Y_pred, alpha=0.6)
lo, hi = Y.min(), Y.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction Line')
plt.title('Actual vs. Predicted Soil Moisture Values (Original Model - Test Set)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# FIX: compute the residuals BEFORE plotting them. In the original cell
# order, `residuals_tuned` was referenced by the histogram but only defined
# afterwards, so a fresh top-to-bottom run raised a NameError.
residuals_tuned = Y_test - Y_pred_tuned

# Distribution of the tuned model's errors (actual - predicted); a roughly
# symmetric, zero-centred histogram indicates an unbiased model.
plt.figure(figsize=(10, 6))
sns.histplot(residuals_tuned, kde=True)
plt.title('Distribution of Residuals for Tuned Model')
plt.xlabel('Residuals (Actual - Predicted)')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()

print("First 5 residuals of the tuned model:")
display(residuals_tuned.head())
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Rank the spectral bands by how much the baseline forest relied on them.
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
}).sort_values(by='Importance', ascending=False)

print("Top 10 Most Important Features:")
display(importance_df.head(10))

# Horizontal bar chart of the 20 leading bands.
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20), palette='viridis', hue='Feature', legend=False)
plt.title('Top 20 Most Important Features (Random Forest Regressor)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
Questo e' il grafico del modello non ottimizzato
e questo il grafico ottimizzato
grafico degli errori residui
Come si vede il modello funziona decisamente bene considerando che siamo in condizioni naturali. L'aspetto decisamente piu' interessante emerge dal grafico successivo sul peso delle diverse bande nei dati
Come si vede viene estratto il valore di 950 nm come lunghezza d'onda piu' significativa del modello (che e' anche l'estremo del sensore)... questa lunghezza d'onda corrisponde alla terza armonica del legame O-H (in realta' sarebbe a 970 nm, ma al di fuori del range di misura). Quindi il modello in maniera autonoma ha individuato una variabile latente (la terza armonica e' un assorbimento molto piu' debole rispetto a quelli presenti nello swir)
Visto che in linea di principio la posizione dell'assorbimento e' anche funzione della temperatura e visto che il dataset include anche la temperatura del suolo ho provato ad inserire nei valori X del modello anche la temperatura oltre alla riflettanza
Non me lo aspettavo ma il modello e' migliorato sensibilmente.
Questo il codice modificato
import pandas as pd

# Reload the dataset; this run adds soil temperature to the feature set.
df = pd.read_csv('/content/soilmoisture_dataset.csv')
print("First 5 rows of the dataset:")
display(df.head())
print("\nInformation about the dataset:")
df.info()

# Target: soil moisture. Features: soil temperature plus every column whose
# name is purely numeric (the reflectance bands, named by wavelength).
y_columns = ['soil_moisture']
X_columns = ['soil_temperature'] + [col for col in df.columns if col.isdigit()]
X = df[X_columns]
Y = df[y_columns]

print(f"Shape of features (X): {X.shape}")
print(f"Shape of targets (Y): {Y.shape}")
print("\nFirst 5 rows of features (X):")
display(X.head())
print("\nFirst 5 rows of targets (Y):")
display(Y.head())

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Same 80/20 split and seed as the first experiment.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"Y_test shape: {Y_test.shape}")

# Y is a one-column DataFrame; ravel() flattens it into the 1-D target
# array expected for single-output regression.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, Y_train.values.ravel())
Y_pred = model.predict(X_test)
print("\nFirst 5 predictions:")
print(Y_pred[:5])
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importance ranking for the temperature-augmented model.
importance_df = pd.DataFrame({
    'Feature': X_columns,
    'Importance': model.feature_importances_,
}).sort_values(by='Importance', ascending=False)

print("Top 10 Feature Importances:")
display(importance_df.head(10))

# Ten most influential features.
plt.figure(figsize=(12, 7))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
plt.title('Top 10 Feature Importances for Soil Moisture and Temperature Prediction')
plt.xlabel('Importance')
plt.ylabel('Reflectance Band (Feature)')
plt.show()

print("Bottom 10 Feature Importances:")
display(importance_df.tail(10))

# Ten least influential features, for contrast.
plt.figure(figsize=(12, 7))
sns.barplot(x='Importance', y='Feature', data=importance_df.tail(10))
plt.title('Bottom 10 Feature Importances for Soil Moisture and Temperature Prediction')
plt.xlabel('Importance')
plt.ylabel('Reflectance Band (Feature)')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Gather actual, predicted and residual values in a single frame for plotting.
results_df = Y_test.copy()
results_df['soil_moisture_pred'] = Y_pred  # 1-D array, single-output model
results_df['soil_moisture_residuals'] = results_df['soil_moisture'] - results_df['soil_moisture_pred']

# Actual vs. predicted with a 1:1 reference line.
plt.figure(figsize=(12, 6))
sns.scatterplot(x='soil_moisture', y='soil_moisture_pred', data=results_df)
span = [results_df['soil_moisture'].min(), results_df['soil_moisture'].max()]
plt.plot(span, span, color='red', linestyle='--', lw=2)
plt.title('Actual vs. Predicted Soil Moisture')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

# Residuals against predictions; a flat cloud around zero suggests no bias.
plt.figure(figsize=(12, 6))
sns.scatterplot(x=results_df['soil_moisture_pred'], y=results_df['soil_moisture_residuals'])
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residual Plot for Soil Moisture')
plt.xlabel('Predicted Soil Moisture')
plt.ylabel('Residuals (Actual - Predicted)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Y_pred is a 1-D array for the single target, so the metrics apply directly.
print(f"\n--- Metrics for {y_columns[0]} ---")
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"R-squared (R2) Score: {r2:.3f}")
Nessun commento:
Posta un commento