Si tratta di misure di laboratorio di diversi materiali plastici (piu' o meno degradati) in fiumi simulati piu' o meno torbidi. I dati sono in formato NetCDF e contengono gli spettri di riflettanza
Nel repository GitHub e' compreso lo script Statistical_analysis_Xgboost_Colab.ipynb, che permette di calcolare il peso di ogni banda nel modello XGBoost
Come era prevedibile la banda piu' significativa cade nello SWIR ma c'e' segnale anche nel visibile
Il problema e' che ho in uso una camera iperspettrale da drone da 400 a 1000 nm (quindi al di fuori del range della feature spettrale ottimale)... vediamo cosa succede se si usano gli stessi spettri ma limitando la finestra tra 400 e 1000 nm
Se al posto di XGBoost gli stessi dati vengono processati con Random Forest, viene confermato come la parte piu' significativa del segnale delle plastiche sia concentrata tra 650 e 700 nm
Il problema e' che tra 650 e 700 nm si concentrano molti segnali delle alghe (picco di fluorescenza a 685 nm, assorbimento dei cianobatteri a 620 nm e, molto vicino, l'inizio del Red Edge a 700 nm). In condizioni reali questi potrebbero essere le maggiori cause di disturbo
import os
import numpy as np
import pandas as pd
import glob
import matplotlib
matplotlib.use('Agg') # Non-interactive backend for batch saving
import matplotlib.pyplot as plt
import netCDF4
from tabulate import tabulate
# ── Paths ────────────────────────────────────────────────────────────────────
# Everything is resolved relative to the PARENT of the working directory the
# script is launched from: input files are read from <parent>/openaire/data_NETCDF.
current_directory = os.getcwd()
print("Current directory:", current_directory)
parent_directory = os.path.dirname(current_directory)
subfolder = "openaire/data_NETCDF"
datapath_file = os.path.join(parent_directory, subfolder)
print("Path of data files:", datapath_file)

# ── Output directories ────────────────────────────────────────────────────────
# Exports are written under <parent>/openaire/exported_data/<polymer>/<background>.
output_root = os.path.join(parent_directory, "openaire", "exported_data")
POLYMERS = ['PET', 'HDPE', 'LDPE', 'PP', 'EPSF', 'Mix', 'Weathered']
# Maps the one-letter background code to the folder name used for it.
BACKGROUNDS = {'C': 'Clear', 'T': 'Turbid', 'F': 'Foamy'}

# Pre-create all output subdirectories polymer / background
for polymer in POLYMERS:
    for bg_key, bg_name in BACKGROUNDS.items():
        folder = os.path.join(output_root, polymer, bg_name)
        # exist_ok=True makes re-running the script on an existing tree safe
        os.makedirs(folder, exist_ok=True)

# Fallback folder for files that don't match known categories
unknown_folder = os.path.join(output_root, "Unknown")
os.makedirs(unknown_folder, exist_ok=True)
# ── Discover & sort files ─────────────────────────────────────────────────────
# Collect every *.nc4 file found directly inside the data folder.
netcdf_files = glob.glob(os.path.join(datapath_file, '*.nc4'))

# Observation number of each file: the leading '_'-separated field of the
# basename with its first character dropped, parsed as an integer
# (e.g. "O001_..." -> 1).
files_number = [
    int(os.path.basename(nc_path).partition('_')[0][1:])
    for nc_path in netcdf_files
]

# Order by (observation number, full path) and keep only the basenames.
numbered_paths = list(zip(files_number, netcdf_files))
numbered_paths.sort()
sortedlist = [os.path.basename(nc_path) for _number, nc_path in numbered_paths]

print(f"\nFound {len(sortedlist)} measurement files.\n")
# ── Summary table (same as original) ─────────────────────────────────────────
# Prints a polymer x background table of file counts, derived from filenames only.
bg_order = ['C', 'T', 'F']  # column order of the table

# Background code of each file: first character of the third '_'-separated
# field. Names with fewer than three fields get '' so they are simply not
# counted under any background instead of aborting the script with IndexError.
file_bg = {}
for name in sortedlist:
    fields = name.split('_')
    file_bg[name] = fields[2][:1] if len(fields) > 2 else ''

# A file counts for a polymer when the polymer label occurs anywhere in its name.
polymer_counts = {
    polymer: sum(polymer in name for name in sortedlist)
    for polymer in POLYMERS
}
background_counts = {
    bg_key: sum(file_bg[name] == bg_key for name in sortedlist)
    for bg_key in BACKGROUNDS
}

table_data = []
for polymer in POLYMERS:
    row = [polymer]
    for bg_key in bg_order:
        combo = [name for name in sortedlist
                 if polymer in name and file_bg[name] == bg_key]
        row.append(len(combo))
    # Last column is the per-polymer total regardless of background
    row.append(polymer_counts[polymer])
    table_data.append(row)

# Separator row, then per-background totals and the grand total
table_data.append(["-----"] * 5)
table_data.append(
    ['Sum'] + [background_counts[bg] for bg in bg_order]
    + [sum(background_counts.values())]
)

headers = ['Polymer'] + [BACKGROUNDS[bg] for bg in bg_order] + ['Sum']
print(tabulate(table_data, headers, tablefmt='pretty'))
print()
# ── Wavelength axis (shared across all files) ─────────────────────────────────
# Built from the first spectrum's sample count and rebuilt only when a later
# file has a different number of samples.
wavelengths = None

# ── Main export loop ──────────────────────────────────────────────────────────
# For every measurement file: write the spectrum as CSV, the RGB image as PNG
# and a spectrum plot as PNG into <output_root>/<polymer>/<background>/.
# A failure on one file is reported and recorded in `skipped`; the loop goes on.
plt.rcParams["font.family"] = "Times New Roman"
plt.rcParams["font.size"] = 14
skipped = []
exported = 0

for idx, filename in enumerate(sortedlist, start=1):
    filepath = os.path.join(datapath_file, filename)
    try:
        # The context manager closes the dataset even when a read below raises
        with netCDF4.Dataset(filepath) as nc_file:
            # ── Read metadata ─────────────────────────────────────────────────
            polymer = getattr(nc_file, 'Debris Polymer', 'Unknown')
            background = getattr(nc_file, 'Background flow status', 'Unknown')
            plastic_frac = getattr(nc_file, 'Plastic fraction(%)', 'N/A')

            # ── Read variables (force load into numpy arrays) ─────────────────
            # [:] copies the data into memory so it stays usable after close
            reflectance_data = nc_file.variables['Reflectacne'][:]  # note: typo preserved to match file
            rgb_var = nc_file.variables['RgbImage'][:]
            # label_var = nc_file.variables['LabeledImage'][:]  # uncomment if needed

        # Work on the flattened spectrum everywhere so the axis length always
        # matches the data, whatever the stored dimensionality is.
        reflectance = reflectance_data.flatten()

        # ── Wavelength axis ───────────────────────────────────────────────────
        # Samples are placed evenly between 350 and 2500 nm.
        # NOTE(review): the file's own wavelength variable, if any, is not
        # read here - confirm the even 350-2500 nm spacing against the dataset.
        if wavelengths is None or len(wavelengths) != reflectance.size:
            wavelengths = np.linspace(350, 2500, reflectance.size)

        # ── Determine output subfolder ────────────────────────────────────────
        matched_polymer = next((p for p in POLYMERS if p in filename), None)
        # Background code = first character of the third '_'-separated field;
        # names with fewer fields fall through to the Unknown folder instead
        # of raising IndexError and being skipped.
        name_fields = filename.split('_')
        bg_code = name_fields[2][:1] if len(name_fields) > 2 else ''
        matched_bg_key = bg_code if bg_code in BACKGROUNDS else None

        if matched_polymer and matched_bg_key:
            out_folder = os.path.join(
                output_root,
                matched_polymer,
                BACKGROUNDS[matched_bg_key],
            )
        else:
            out_folder = unknown_folder

        # ── Shared base filename (strip extension) ────────────────────────────
        base_name = os.path.splitext(filename)[0]  # e.g. "O001_PET_C_..."

        # Both figures carry the same title
        title = (
            f"O#{idx} | Polymer: {polymer} | Background: {background}"
            f" | Plastic fraction: {plastic_frac}%"
        )

        # ── Save reflectance CSV ──────────────────────────────────────────────
        csv_path = os.path.join(out_folder, base_name + '.csv')
        df = pd.DataFrame({
            'wavelength_nm': wavelengths,
            'reflectance': reflectance,
        })
        # Prepend metadata columns so the CSV is self-describing
        df.insert(0, 'observation', idx)
        df.insert(1, 'filename', filename)
        df.insert(2, 'polymer', polymer)
        df.insert(3, 'background', background)
        df.insert(4, 'plastic_fraction', plastic_frac)
        df.to_csv(csv_path, index=False)

        # ── Save RGB image ────────────────────────────────────────────────────
        rgb_image_data = np.transpose(rgb_var, (2, 1, 0))  # match MATLAB order
        img_path = os.path.join(out_folder, base_name + '_rgb.png')
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            ax.imshow(rgb_image_data)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_title(title, fontsize=11)
            fig.tight_layout()
            fig.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Always release the figure, otherwise failed files pile up
            # open figures over a long batch run
            plt.close(fig)

        # ── Save reflectance spectrum ─────────────────────────────────────────
        spectrum_path = os.path.join(out_folder, base_name + '_spectrum.png')
        fig, ax = plt.subplots(figsize=(12, 5))
        try:
            ax.plot(wavelengths, reflectance, linewidth=2.0,
                    color='steelblue', label='Reflectance')
            ax.set_xlabel('Wavelength (nm)')
            ax.set_ylabel('Reflectance')
            ax.set_xlim(350, 2500)
            ax.set_title(title, fontsize=11)
            ax.grid(True, which='major', linestyle='--', color='gray', alpha=0.7)
            ax.legend(fontsize=12)
            fig.tight_layout()
            fig.savefig(spectrum_path, dpi=150, bbox_inches='tight')
        finally:
            plt.close(fig)

        exported += 1
        print(f"[{idx:>4}/{len(sortedlist)}] Saved: {base_name}_rgb.png | _spectrum.png | .csv")
    except Exception as e:
        # Per-file boundary: report, remember the reason, continue with the next file
        print(f"[{idx:>4}/{len(sortedlist)}] SKIPPED {filename}: {e}")
        skipped.append((filename, str(e)))
# ── Final report ──────────────────────────────────────────────────────────────
# Print how many files were exported and, for each skipped file, the recorded reason.
print(f"\n{'='*60}")
print(f"Export complete: {exported} files saved, {len(skipped)} skipped.")
if skipped:
    print("\nSkipped files:")
    for fname, reason in skipped:
        print(f" {fname}: {reason}")
print(f"\nOutput root: {output_root}")
Nessun commento:
Posta un commento