Durante il dottorato avevo fatto qualche prova con campioni artificiali di terreno ad umidita' controllata ed avevo un'ottima correlazione con la misura iperspettrale
I campioni erano di terreno naturale con umidita' aggiunta artificialmente
 |
| Campione di tesi dottorato - illuminazione solare |
Dopo la rimozione del continuum avevo lavorato sugli assorbimenti nello swir
Adesso ho trovato questo lavoro
Felix M. Riese and Sina Keller, “Introducing a Framework of
Self-Organizing Maps for Regression of Soil Moisture with Hyperspectral
Data,” in IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium, Valencia, Spain, 2018, pp. 6151-6154.
i dati sono riportati a questo link in formato csv con 680 spettri VNIR di campagna e relative misure di umidita' realizzate in sito tramite TRIME-PICO time-domain reflectometry (TDR)
questi sono i dati normalizzati con rimozione del continuo
In VNIR non risulta immediato un approccio di misura della profondita' di picco ed ho provato a vedere come si comportava un approccio random forest (ho utilizzato gemini in Google Colab in modo da avere anche l'ottimizzazione dei parametri in modo semplice)
import pandas as pd

# Read the field hyperspectral soil-moisture dataset (Riese & Keller, IGARSS 2018).
df = pd.read_csv('/content/soilmoisture_dataset.csv')
display(df.head())

# The target (soil moisture) lives in the third column; the spectral bands
# start at the 19th column and run to the end of the frame.
Y = df.iloc[:, 2]
X = df.iloc[:, 18:]

# Keep only complete observations: join features and target, drop every row
# with a NaN anywhere, then split them apart again (target is the last column).
complete = pd.concat([X, Y], axis=1).dropna()
X = complete.iloc[:, :-1]
Y = complete.iloc[:, -1]

print(f"Shape of X: {X.shape}")
print(f"Shape of Y: {Y.shape}")
display(X.head())
display(Y.head())
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Baseline forest: 100 trees, otherwise default hyper-parameters.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

# Score the held-out split.
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hyper-parameter search space for the forest.
param_dist = {
    'n_estimators': np.arange(100, 1000, 100),        # forest size
    'max_features': ['sqrt', 'log2'],                 # features tried per split
    'max_depth': [None, 10, 20, 30, 40, 50],          # tree depth cap
    'min_samples_split': np.arange(2, 20, 2),         # min samples to split a node
    'min_samples_leaf': np.arange(1, 10, 1),          # min samples per leaf
    'bootstrap': [True, False],                       # sample with/without replacement
}

rf = RandomForestRegressor(random_state=42)

# Draw 50 random configurations; each is scored with 5-fold CV on MSE.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
random_search.fit(X_train, Y_train)

print("Best Parameters found:", random_search.best_params_)
print("Best Negative Mean Squared Error (from CV):", random_search.best_score_)

# Re-evaluate the winning estimator on the untouched test split.
best_rf_model = random_search.best_estimator_
Y_pred_tuned = best_rf_model.predict(X_test)
mse_tuned = mean_squared_error(Y_test, Y_pred_tuned)
r2_tuned = r2_score(Y_test, Y_pred_tuned)
print(f"\nPerformance of the tuned Random Forest model on the test set:")
print(f"Mean Squared Error (tuned): {mse_tuned:.4f}")
print(f"R-squared (tuned): {r2_tuned:.4f}")
import matplotlib.pyplot as plt
import seaborn as sns

# Measured vs. predicted moisture for the baseline model, with a 1:1 line.
plt.figure(figsize=(10, 7))
sns.scatterplot(x=Y_test, y=Y_pred, alpha=0.6)
lo, hi = Y.min(), Y.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction Line')
plt.title('Actual vs. Predicted Soil Moisture Values (Test Set)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Same scatter as above, now for the tuned model's predictions.
plt.figure(figsize=(10, 7))
sns.scatterplot(x=Y_test, y=Y_pred_tuned, alpha=0.6)
lo, hi = Y.min(), Y.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction Line')
plt.title('Actual vs. Predicted Soil Moisture Values (Tuned Model - Test Set)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): this cell is an exact repeat of the previous tuned-model
# scatter (likely a duplicated Colab cell); kept so the post's figure
# sequence is unchanged.
plt.figure(figsize=(10, 7))
sns.scatterplot(x=Y_test, y=Y_pred_tuned, alpha=0.6)
lo, hi = Y.min(), Y.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction Line')
plt.title('Actual vs. Predicted Soil Moisture Values (Tuned Model - Test Set)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Baseline-model scatter again, retitled to contrast with the tuned figure.
plt.figure(figsize=(10, 7))
sns.scatterplot(x=Y_test, y=Y_pred, alpha=0.6)
lo, hi = Y.min(), Y.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction Line')
plt.title('Actual vs. Predicted Soil Moisture Values (Original Model - Test Set)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# FIX: compute the residuals BEFORE plotting them. In the original cell
# order, `residuals_tuned` was referenced by the histogram but only defined
# afterwards, so a fresh top-to-bottom run raised a NameError.
residuals_tuned = Y_test - Y_pred_tuned

# Distribution of the tuned model's errors (actual - predicted); a roughly
# symmetric, zero-centred histogram indicates an unbiased model.
plt.figure(figsize=(10, 6))
sns.histplot(residuals_tuned, kde=True)
plt.title('Distribution of Residuals for Tuned Model')
plt.xlabel('Residuals (Actual - Predicted)')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()

print("First 5 residuals of the tuned model:")
display(residuals_tuned.head())
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Rank the spectral bands by how much the baseline forest relied on them.
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
}).sort_values(by='Importance', ascending=False)

print("Top 10 Most Important Features:")
display(importance_df.head(10))

# Horizontal bar chart of the 20 leading bands.
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20), palette='viridis', hue='Feature', legend=False)
plt.title('Top 20 Most Important Features (Random Forest Regressor)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
Questo e' il grafico del modello non ottimizzato
e questo il grafico ottimizzato
grafico degli errori residui
Come si vede il modello funziona decisamente bene considerando che siamo in condizioni naturali. L'aspetto decisamente piu' interessante emerge dal grafico successivo sul peso delle diverse bande nei dati
Come si vede viene estratto il valore di 950 nm come lunghezza d'onda piu' significativa del modello (che e' anche l'estremo del sensore)... questa lunghezza d'onda corrisponde alla terza armonica del legame O-H (in realta' sarebbe a 970 nm, ma al di fuori del range di misura). Quindi il modello in maniera autonoma ha individuato una variabile latente (la terza armonica e' un assorbimento molto piu' debole rispetto a quelli presenti nello swir)
Visto che in linea di principio la posizione dell'assorbimento e' anche funzione della temperatura e visto che il dataset include anche la temperatura del suolo ho provato ad inserire nei valori X del modello anche la temperatura oltre alla riflettanza
Non me lo aspettavo ma il modello e' migliorato sensibilmente.
Questo il codice modificato
import pandas as pd

# Reload the dataset; this run adds soil temperature to the feature set.
df = pd.read_csv('/content/soilmoisture_dataset.csv')
print("First 5 rows of the dataset:")
display(df.head())
print("\nInformation about the dataset:")
df.info()

# Target: soil moisture. Features: soil temperature plus every column whose
# name is purely numeric (the reflectance bands, named by wavelength).
y_columns = ['soil_moisture']
X_columns = ['soil_temperature'] + [col for col in df.columns if col.isdigit()]
X = df[X_columns]
Y = df[y_columns]

print(f"Shape of features (X): {X.shape}")
print(f"Shape of targets (Y): {Y.shape}")
print("\nFirst 5 rows of features (X):")
display(X.head())
print("\nFirst 5 rows of targets (Y):")
display(Y.head())

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Same 80/20 split and seed as the first experiment.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"Y_test shape: {Y_test.shape}")

# Y is a one-column DataFrame; ravel() flattens it into the 1-D target
# array expected for single-output regression.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, Y_train.values.ravel())
Y_pred = model.predict(X_test)
print("\nFirst 5 predictions:")
print(Y_pred[:5])
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importance ranking for the temperature-augmented model.
importance_df = pd.DataFrame({
    'Feature': X_columns,
    'Importance': model.feature_importances_,
}).sort_values(by='Importance', ascending=False)

print("Top 10 Feature Importances:")
display(importance_df.head(10))

# Ten most influential features.
plt.figure(figsize=(12, 7))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
plt.title('Top 10 Feature Importances for Soil Moisture and Temperature Prediction')
plt.xlabel('Importance')
plt.ylabel('Reflectance Band (Feature)')
plt.show()

print("Bottom 10 Feature Importances:")
display(importance_df.tail(10))

# Ten least influential features, for contrast.
plt.figure(figsize=(12, 7))
sns.barplot(x='Importance', y='Feature', data=importance_df.tail(10))
plt.title('Bottom 10 Feature Importances for Soil Moisture and Temperature Prediction')
plt.xlabel('Importance')
plt.ylabel('Reflectance Band (Feature)')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Gather actual, predicted and residual values in a single frame for plotting.
results_df = Y_test.copy()
results_df['soil_moisture_pred'] = Y_pred  # 1-D array, single-output model
results_df['soil_moisture_residuals'] = results_df['soil_moisture'] - results_df['soil_moisture_pred']

# Actual vs. predicted with a 1:1 reference line.
plt.figure(figsize=(12, 6))
sns.scatterplot(x='soil_moisture', y='soil_moisture_pred', data=results_df)
span = [results_df['soil_moisture'].min(), results_df['soil_moisture'].max()]
plt.plot(span, span, color='red', linestyle='--', lw=2)
plt.title('Actual vs. Predicted Soil Moisture')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

# Residuals against predictions; a flat cloud around zero suggests no bias.
plt.figure(figsize=(12, 6))
sns.scatterplot(x=results_df['soil_moisture_pred'], y=results_df['soil_moisture_residuals'])
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residual Plot for Soil Moisture')
plt.xlabel('Predicted Soil Moisture')
plt.ylabel('Residuals (Actual - Predicted)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Y_pred is a 1-D array for the single target, so the metrics apply directly.
print(f"\n--- Metrics for {y_columns[0]} ---")
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"R-squared (R2) Score: {r2:.3f}")
Nessun commento:
Posta un commento