import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Load the soil-moisture dataset (hard-coded Colab path — adjust if running elsewhere).
df = pd.read_csv('/content/soilmoisture_dataset.csv')
# Extract Y values (3rd column, index 2)
Y = df.iloc[:, 2] # Assuming the 3rd column (index 2) is 'soil_moisture'
# Extract X values (columns from 5th to the end, index 4 onwards)
X = df.iloc[:, 4:] # Assuming columns from index 4 onwards are reflectance values
# Split data into training and testing sets (optional, but good practice for model evaluation)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Initialize and train the PLSR model
# The number of components (n_components) is a hyperparameter that might need tuning.
# A common starting point is to use a small number, e.g., 2 or 3, or determine it through cross-validation.
pls = PLSRegression(n_components=9) # NOTE(review): old comment claimed "4 components as requested" but the code uses 9 — the CV sweep below tunes this properly
pls.fit(X_train, Y_train)
print("\nPLSR model trained successfully!")
# Make predictions on the test set
Y_pred = pls.predict(X_test)  # 2-D (n_samples, 1) — later cells call .flatten() on predictions
# Evaluate the model
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2) score: {r2:.4f}")
from sklearn.model_selection import KFold, cross_val_score, cross_validate
import numpy as np
import matplotlib.pyplot as plt
# Sweep n_components with 5-fold CV. Upper bound: PLS cannot extract more
# components than features or than (n_samples - 1); capped at 15 to keep the
# sweep cheap.
max_components = min(X_train.shape[1], X_train.shape[0] - 1)
components_to_test = np.arange(1, min(max_components + 1, 16))
mse_scores = []
r2_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)
print(f"Tuning n_components from 1 to {components_to_test[-1]} using 5-Fold Cross-Validation...")
for n_comp in components_to_test:
    pls_cv = PLSRegression(n_components=n_comp)
    # One cross_validate call scores BOTH metrics on the same folds — halves
    # the number of model fits versus two separate cross_val_score calls.
    # (Also avoids the old bug of rebinding the scalar `r2` from the cell
    # above to a fold-score array.)
    cv_res = cross_validate(pls_cv, X_train, Y_train, cv=kf,
                            scoring=('neg_mean_squared_error', 'r2'))
    # sklearn maximizes, so MSE comes back negated — flip the sign.
    mse_scores.append(-np.mean(cv_res['test_neg_mean_squared_error']))
    r2_scores.append(np.mean(cv_res['test_r2']))
# Find the optimal n_components based on the minimum MSE
best_n_components_mse = components_to_test[np.argmin(mse_scores)]
min_mse = np.min(mse_scores)
# Find the optimal n_components based on the maximum R2
best_n_components_r2 = components_to_test[np.argmax(r2_scores)]
max_r2 = np.max(r2_scores)
print(f"\nOptimal n_components (min MSE): {best_n_components_mse} (MSE: {min_mse:.4f})")
print(f"Optimal n_components (max R2): {best_n_components_r2} (R2: {max_r2:.4f})")
# Side-by-side CV curves — MSE (left) and R-squared (right) vs. n_components —
# each annotated with a dashed vertical line at its optimum.
fig = plt.figure(figsize=(12, 5))
ax_mse = fig.add_subplot(1, 2, 1)
ax_mse.plot(components_to_test, mse_scores, marker='o')
ax_mse.set_title('PLSR MSE vs. Number of Components (Cross-Validation)')
ax_mse.set_xlabel('Number of Components')
ax_mse.set_ylabel('Mean Squared Error (MSE)')
ax_mse.set_xticks(components_to_test)
ax_mse.grid(True)
ax_mse.axvline(x=best_n_components_mse, color='r', linestyle='--', label=f'Optimal: {best_n_components_mse}')
ax_mse.legend()
ax_r2 = fig.add_subplot(1, 2, 2)
ax_r2.plot(components_to_test, r2_scores, marker='o', color='green')
ax_r2.set_title('PLSR R-squared vs. Number of Components (Cross-Validation)')
ax_r2.set_xlabel('Number of Components')
ax_r2.set_ylabel('R-squared (R2)')
ax_r2.set_xticks(components_to_test)
ax_r2.grid(True)
ax_r2.axvline(x=best_n_components_r2, color='r', linestyle='--', label=f'Optimal: {best_n_components_r2}')
ax_r2.legend()
fig.tight_layout()
plt.show()
# Retrain on the full training split with the CV-selected component count and
# score once on the held-out test set. Uses the R2-optimal choice — note it
# may differ from the MSE-optimal choice found above.
final_pls_model = PLSRegression(n_components=best_n_components_r2) # Using R2 optimized components
final_pls_model.fit(X_train, Y_train)
Y_pred_tuned = final_pls_model.predict(X_test)  # reused by the scatter plot below
mse_tuned = mean_squared_error(Y_test, Y_pred_tuned)
r2_tuned = r2_score(Y_test, Y_pred_tuned)
print(f"\nModel re-trained with optimal n_components ({best_n_components_r2}).")
print(f"Test Set Mean Squared Error (MSE) after tuning: {mse_tuned:.4f}")
print(f"Test Set R-squared (R2) score after tuning: {r2_tuned:.4f}")
import matplotlib.pyplot as plt
import seaborn as sns
# Actual vs. predicted soil moisture for the tuned model; points on the red
# dashed diagonal would be perfect predictions.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=Y_test, y=Y_pred_tuned.flatten(), color='blue', alpha=0.7)
lo, hi = min(Y_test), max(Y_test)
plt.plot([lo, hi], [lo, hi], color='red', linestyle='--', lw=2, label='Perfect Prediction')
plt.title('Actual vs. Predicted Soil Moisture (Tuned PLSR Model)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.grid(True)
plt.legend()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter each of the 10 most influential wavelengths against soil moisture.
# NOTE(review): `sorted_importance` is never defined in this script — it is
# presumably built in another notebook cell from the PLSR coefficient
# magnitudes (see the markdown at the bottom of the file). Confirm that cell
# runs first, otherwise this raises NameError.
# Get the names of the top 10 predictor variables
top_10_features = sorted_importance.head(10).index
# Set up the figure and axes for subplots: a 3x4 grid gives 12 axes; the two
# unused ones are removed below.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 15))
axes = axes.flatten() # Flatten the 2D array of axes for easy iteration
fig.suptitle('Relationship Between Top Predictor Wavelengths and Soil Moisture', fontsize=16)
# Iterate through the top 10 features and create a scatter plot for each
for i, feature in enumerate(top_10_features):
    sns.scatterplot(x=X_train[feature], y=Y_train, ax=axes[i], alpha=0.6)
    axes[i].set_title(f'Wavelength: {feature} nm')
    axes[i].set_xlabel(f'Reflectance at {feature} nm')
    axes[i].set_ylabel('Soil Moisture')
    axes[i].grid(True, linestyle='--', alpha=0.7)
# Hide any unused subplots
for j in range(len(top_10_features), len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout to prevent title overlap
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Wavelength axis for the derivative spectra: each np.diff value sits at the
# mid-point between the two original wavelengths it was computed from.
# NOTE: relies on X_original / X_deriv being defined by the derivative cell.
original_wavelengths = X_original.columns.astype(float).tolist()
derivative_wavelengths = np.array(
    [(left + right) / 2
     for left, right in zip(original_wavelengths, original_wavelengths[1:])]
)
plt.figure(figsize=(12, 7))
# Overlay every sample's first-derivative spectrum in a single colour;
# transparency keeps the overall band structure visible.
for row_idx in range(X_deriv.shape[0]):
    plt.plot(derivative_wavelengths, X_deriv.iloc[row_idx], alpha=0.5, color='blue')
plt.title('All First Derivative of Reflectance Spectra')
plt.xlabel('Wavelength (nm)')
plt.ylabel('First Derivative of Reflectance')
plt.grid(True, linestyle='--', alpha=0.7)
# No legend: one trace per sample would make it unreadable.
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Actual vs. predicted soil moisture for the derivative-feature model, with a
# perfect-prediction reference diagonal.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=Y_test, y=Y_pred.flatten(), color='green', alpha=0.7)
bounds = [min(Y_test), max(Y_test)]
plt.plot(bounds, bounds, color='red', linestyle='--', lw=2, label='Perfect Prediction')
plt.title('Actual vs. Predicted Soil Moisture (Derivative Model)')
plt.xlabel('Actual Soil Moisture')
plt.ylabel('Predicted Soil Moisture')
plt.grid(True)
plt.legend()
plt.show()
# Extract Y values (3rd column, index 2)
Y = df.iloc[:, 2]  # Assuming the 3rd column (index 2) is 'soil_moisture'
# Extract X values (columns from 5th to the end, index 4 onwards) - original reflectance
X_original = df.iloc[:, 4:]  # Assuming columns from index 4 onwards are reflectance values
# First derivative of reflectance: np.diff along axis=1 takes adjacent-column
# differences per sample (row). Pass index=X_original.index so the derivative
# frame keeps df's row labels — without it pd.DataFrame resets to a default
# RangeIndex and train_test_split could silently misalign X rows with Y
# whenever df has a non-default index.
X_deriv = pd.DataFrame(np.diff(X_original.values, axis=1), index=X_original.index)
# Generic names for the derivative columns (one fewer than the original bands).
X_deriv.columns = [f'deriv_{col_idx}' for col_idx in range(X_deriv.shape[1])]
# Now, use the derivative as the feature matrix X
X = X_deriv
# Display the shapes of X and Y to confirm
print(f"Shape of X (first derivative reflectance values): {X.shape}")
print(f"Shape of Y (soil_moisture): {Y.shape}")
# Split data into training and testing sets (optional, but good practice for model evaluation)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Train a PLSR model on the derivative features. n_components=4 is a starting
# point; it could be tuned with the same CV sweep used for the raw spectra.
pls = PLSRegression(n_components=4)
pls.fit(X_train, Y_train)
print("\nPLSR model trained successfully!")
# Make predictions on the test set
Y_pred = pls.predict(X_test)
# Evaluate the model
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2) score: {r2:.4f}")
"""The predictor variables are represented by column names that are numerical, likely corresponding to specific wavelengths (in nanometers) from the reflectance spectrum. These wavelengths are the features identified by the PLSR model as having the strongest influence on predicting soil moisture.
**Interpreting the Coefficients:**
The regression coefficients obtained from the PLSR model quantify the relationship between each predictor variable (wavelength) and the target variable (soil moisture):
* **Magnitude:** The absolute value of a coefficient indicates the strength of the variable's influence. A larger absolute coefficient means that a small change in that wavelength's reflectance has a greater impact on the predicted soil moisture.
* **Sign (Positive/Negative):**
* A **positive coefficient** indicates a positive correlation: as the reflectance at that wavelength increases, the soil moisture is predicted to increase.
* A **negative coefficient** indicates a negative correlation: as the reflectance at that wavelength increases, the soil moisture is predicted to decrease.
In the context of soil moisture prediction from reflectance data, specific wavelengths are known to be more sensitive to water absorption. The model has highlighted these critical wavelengths, allowing us to understand which parts of the spectrum are most informative for estimating soil moisture content.
Based on the analysis, the top 10 most important predictor variables (wavelengths) and their respective absolute coefficient values are:
"""
display(sorted_importance.head(10))