DHOB (IU5SGN): Bande spettrali Plastiche (Hyperspectral Reflectance Database of Plastic Debris for River Ecosystems)

domenica 12 aprile 2026

Bande spettrali Plastiche (Hyperspectral Reflectance Database of Plastic Debris for River Ecosystems)

Premesso che in laboratorio funziona sempre tutto mentre nel mondo reale non funziona piu' nessun modello in modo accettabile e' interessante il database Hyperspectral Reflectance Database of Plastic Debris for River Ecosystems che si trova all'indirizzo https://zenodo.org/records/13377060 mentre il codice si trova https://github.com/olyae001/Hyperspectral_reflectance_library

Olyaei, M., Ebtehaj, A., & Ellis, C. R. (2024). A Hyperspectral Reflectance Database of Plastic Debris for River Ecosystems [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13377060

Si tratta di misure di laboratorio di diversi materiali plastici (piu' o meno degradati) in fiumi simulati piu' o meno torbidi. I dati sono in formato netCDF e al loro interno sono compresi sia gli spettri di riflettanza sia le immagini RGB

In github e' compreso lo script Statistical_analysis_Xgboost_Colab.ipynb che permette di calcolare il peso di ogni banda nel modello Xgboost

Come era prevedibile la banda piu' significativa cade nello SWIR ma c'e' segnale anche nel visibile

1) 1178 nm

2) 611 nm

3) 676 nm

4) 559 nm

5) 677 nm

6) 1206 nm

[[1.17800000e+03 8.47618878e-02]
 [6.11000000e+02 4.13786322e-02]
 [6.76000000e+02 3.15730013e-02]
 [5.59000000e+02 2.42403690e-02]
 [6.77000000e+02 2.01107953e-02]
 [1.20600000e+03 1.70316305e-02]
 [4.44000000e+02 1.70264784e-02]
 [2.16600000e+03 1.45021593e-02]
 [2.41100000e+03 1.36355786e-02]
 [1.17300000e+03 1.23202708e-02]]

Questa la matrice di confusione che indica una ottima performance del modello

Il problema e' che io ho in uso una camera iperspettrale da drone da 400 a 1000 nm (quindi al di fuori del range della feature spettrale ottimale)... vediamo cosa succede se si usano gli stessi spettri ma limitando la finestra tra 400 e 1000 nm

Tra 600 e 700 nm sono concentrate numerose bande diagnostiche

ma usando solo la parte VNIR il modello peggiora in modo sensibile

Se al posto di XGBoost gli stessi dati vengono processati con Random Forest viene confermato come la parte piu' significativa delle plastiche sia concentrata tra 650 e 700 nm

Il problema e' che tra 650 e 700 nm sono concentrati molti segnali delle alghe (picco di fluorescenza a 685 nm, assorbimento dei cianobatteri a 620 nm, e molto vicino c'e' l'inizio del Red Edge a 700 nm). In condizioni reali questi potrebbero essere le maggiori cause di disturbo

Questo lo script per estrarre le immagini e gli spettri dai file netcdf

import os
import numpy as np
import pandas as pd
import glob
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend for batch saving
import matplotlib.pyplot as plt
import netCDF4
from tabulate import tabulate

# ── Paths ────────────────────────────────────────────────────────────────────
current_directory = os.getcwd()
print("Current directory:", current_directory)

# Input data and exported results both live under "openaire/" in the parent
# of the working directory.
parent_directory = os.path.dirname(current_directory)

subfolder = "openaire/data_NETCDF"
datapath_file = os.path.join(parent_directory, subfolder)
print("Path of data files:", datapath_file)

# ── Output directories ────────────────────────────────────────────────────────
output_root = os.path.join(parent_directory, "openaire", "exported_data")

POLYMERS    = ['PET', 'HDPE', 'LDPE', 'PP', 'EPSF', 'Mix', 'Weathered']
BACKGROUNDS = {'C': 'Clear', 'T': 'Turbid', 'F': 'Foamy'}

# Build the whole  <output_root>/<polymer>/<background name>  tree up front so
# the export loop never has to create a directory itself.
export_dirs = [
    os.path.join(output_root, polymer_name, background_name)
    for polymer_name in POLYMERS
    for background_name in BACKGROUNDS.values()
]
for export_dir in export_dirs:
    os.makedirs(export_dir, exist_ok=True)

# Catch-all destination for files whose name matches no known
# polymer/background combination.
unknown_folder = os.path.join(output_root, "Unknown")
os.makedirs(unknown_folder, exist_ok=True)

# ── Discover & sort files ─────────────────────────────────────────────────────
netcdf_files = glob.glob(os.path.join(datapath_file, '*.nc4'))

# Observation number: the integer that follows the one-character prefix of the
# first '_'-separated field of the base name (e.g. 1 for "O001_...").
files_number = []
for nc_path in netcdf_files:
    leading_field = os.path.basename(nc_path).split('_')[0]
    files_number.append(int(leading_field[1:]))

# Order by observation number (ties fall back to the full path), then keep
# only the base names.
numbered_paths = list(zip(files_number, netcdf_files))
numbered_paths.sort()
sortedlist = [os.path.basename(nc_path) for _, nc_path in numbered_paths]

print(f"\nFound {len(sortedlist)} measurement files.\n")

# ── Summary table (same as original) ─────────────────────────────────────────
# Files per polymer: the polymer tag is matched as a substring of the name.
polymer_counts = {
    tag: sum(1 for name in sortedlist if tag in name)
    for tag in POLYMERS
}

# Files per background: the key is the first character of the third
# '_'-separated field of the name.
background_counts = {
    key: sum(1 for name in sortedlist if key == name.split('_')[2][0])
    for key in BACKGROUNDS
}

# Column order of the printed table.
bg_columns = ['C', 'T', 'F']

# One row per polymer: [polymer, count per background..., polymer total].
table_data = []
for tag in POLYMERS:
    per_background = [
        sum(1 for name in sortedlist
            if tag in name and key == name.split('_')[2][0])
        for key in bg_columns
    ]
    table_data.append([tag] + per_background + [polymer_counts[tag]])

# Separator row followed by the per-background totals.
table_data.append(["-----"] * 5)
column_totals = [background_counts[key] for key in bg_columns]
table_data.append(['Sum'] + column_totals + [sum(background_counts.values())])

headers = ['Polymer'] + [BACKGROUNDS[key] for key in bg_columns] + ['Sum']
print(tabulate(table_data, headers, tablefmt='pretty'))
print()

# ── Wavelength axis (shared across all files) ─────────────────────────────────
# Will be computed from the first file's reflectance length; reused for all.
wavelengths = None

# ── Main export loop ──────────────────────────────────────────────────────────
# Global plot styling, applied to every figure created afterwards.
plt.rcParams.update({
    "font.family": "Times New Roman",
    "font.size": 14,
})

# Run bookkeeping: number of files exported and (filename, error) pairs for
# the files that failed.
exported = 0
skipped  = []

for idx, filename in enumerate(sortedlist, start=1):
    # Export one measurement file as three artefacts sharing a base name:
    # a CSV with the spectrum, a PNG of the RGB image and a PNG plot of the
    # spectrum.  Any failure is recorded in `skipped` and the loop continues
    # with the next file.
    filepath = os.path.join(datapath_file, filename)

    try:
        # The context manager closes the dataset even if an attribute or
        # variable lookup below raises.
        with netCDF4.Dataset(filepath) as nc_file:
            # ── Read metadata (global attributes, with fallbacks) ─────────────
            polymer      = getattr(nc_file, 'Debris Polymer',         'Unknown')
            background   = getattr(nc_file, 'Background flow status', 'Unknown')
            plastic_frac = getattr(nc_file, 'Plastic fraction(%)',    'N/A')

            # ── Read variables ────────────────────────────────────────────────
            # Slicing with [:] loads the values into memory, so they remain
            # usable after the file is closed.
            reflectance_data = nc_file.variables['Reflectacne'][:]   # note: typo preserved to match file
            rgb_var          = nc_file.variables['RgbImage'][:]
            # label_var      = nc_file.variables['LabeledImage'][:]  # uncomment if needed

        # Work on the 1-D spectrum everywhere so the axis length always
        # matches the data, whatever the stored array shape is.
        spectrum = reflectance_data.flatten()

        # ── Wavelength axis ───────────────────────────────────────────────────
        # Not read from the file: an evenly spaced 350-2500 nm axis is assumed
        # and rebuilt only when the number of samples changes.
        if wavelengths is None or len(wavelengths) != spectrum.size:
            wavelengths = np.linspace(350, 2500, spectrum.size)

        # ── Determine output subfolder ────────────────────────────────────────
        # Polymer: first known tag found anywhere in the file name.
        # Background: first character of the third '_'-separated field.
        # Names with fewer than three fields yield no background key and are
        # routed to the Unknown folder instead of raising.
        name_fields     = filename.split('_')
        bg_code         = name_fields[2][:1] if len(name_fields) > 2 else ''
        matched_polymer = next((p for p in POLYMERS if p in filename), None)
        matched_bg_key  = bg_code if bg_code in BACKGROUNDS else None

        if matched_polymer and matched_bg_key:
            out_folder = os.path.join(
                output_root,
                matched_polymer,
                BACKGROUNDS[matched_bg_key]
            )
        else:
            out_folder = unknown_folder

        # ── Shared base filename (strip extension) ────────────────────────────
        base_name = os.path.splitext(filename)[0]   # e.g. "O001_PET_C_..."

        # ── Save reflectance CSV ──────────────────────────────────────────────
        csv_path = os.path.join(out_folder, base_name + '.csv')
        df = pd.DataFrame({
            'wavelength_nm':  wavelengths,
            'reflectance':    spectrum,
        })
        # Prepend metadata columns so the CSV is self-describing
        df.insert(0, 'observation',       idx)
        df.insert(1, 'filename',          filename)
        df.insert(2, 'polymer',           polymer)
        df.insert(3, 'background',        background)
        df.insert(4, 'plastic_fraction',  plastic_frac)
        df.to_csv(csv_path, index=False)

        # Title shared by both figures.
        figure_title = (
            f"O#{idx}  |  Polymer: {polymer}  |  Background: {background}"
            f"  |  Plastic fraction: {plastic_frac}%"
        )

        # ── Save RGB image ────────────────────────────────────────────────────
        # Reverse the axis order before display (the original note says this
        # matches the MATLAB order).
        rgb_image_data = np.transpose(rgb_var, (2, 1, 0))

        img_path = os.path.join(out_folder, base_name + '_rgb.png')

        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            ax.imshow(rgb_image_data)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_title(figure_title, fontsize=11)
            fig.tight_layout()
            fig.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Always release the figure; a batch run would otherwise
            # accumulate open figures whenever drawing or saving fails.
            plt.close(fig)

        # ── Save reflectance spectrum ─────────────────────────────────────────
        spectrum_path = os.path.join(out_folder, base_name + '_spectrum.png')

        fig, ax = plt.subplots(figsize=(12, 5))
        try:
            ax.plot(wavelengths, spectrum, linewidth=2.0,
                    color='steelblue', label='Reflectance')
            ax.set_xlabel('Wavelength (nm)')
            ax.set_ylabel('Reflectance')
            ax.set_xlim(350, 2500)
            ax.set_title(figure_title, fontsize=11)
            ax.grid(True, which='major', linestyle='--', color='gray', alpha=0.7)
            ax.legend(fontsize=12)
            fig.tight_layout()
            fig.savefig(spectrum_path, dpi=150, bbox_inches='tight')
        finally:
            plt.close(fig)

        exported += 1
        print(f"[{idx:>4}/{len(sortedlist)}]  Saved: {base_name}_rgb.png  |  _spectrum.png  |  .csv")

    except Exception as e:
        # Per-file boundary: report the failure, remember it for the final
        # report and keep processing the remaining files.
        print(f"[{idx:>4}/{len(sortedlist)}]  SKIPPED {filename}: {e}")
        skipped.append((filename, str(e)))

# ── Final report ──────────────────────────────────────────────────────────────
# Closing summary: totals, then one line per failed file (if any), then the
# location of the exported tree.
print(f"\n{'='*60}")
print(f"Export complete:  {exported} files saved,  {len(skipped)} skipped.")
if skipped:
    print("\nSkipped files:")
    print("\n".join(f"  {fname}: {reason}" for fname, reason in skipped))
print(f"\nOutput root: {output_root}")