Sunday, August 24, 2025

Neural network for lithology recognition: The Drill Core Image Dataset (DCID)

Update: the script can be modified to run EfficientNetB4, but the training time gets considerably longer. On an M1, each epoch takes at least 25 minutes.
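The swap is essentially limited to the base-model constructor and the input resolution; a minimal sketch, assuming the rest of the training script below stays the same (here the ImageNet weights are loaded directly, whereas the script below downloads them manually):

from tensorflow import keras

# EfficientNetB4 was trained at 380x380, so the input size should be raised accordingly,
# which is a large part of why each epoch becomes much slower.
img_size = (380, 380)
base_model = keras.applications.EfficientNetB4(
    input_shape=img_size + (3,),
    include_top=False,
    weights="imagenet"
)
base_model.trainable = False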

=====================================

I found a dataset of high-quality 512x512 photographs of drill cores annotated with the corresponding lithology: https://github.com/JiayuLi1120/drill-core-image-dataset

Of course, studies of this kind already exist, for example https://www.sciencedirect.com/science/article/pii/S0920410520309888

There are actually two datasets: the first with 7 lithology categories, the second with 35. I used the second one, which provides 800 training images and 200 test photographs for each lithology.
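image_dataset_from_directory (used in the script below) expects one sub-folder per class; a quick, optional sanity check of the layout and counts, assuming the same paths as in the script:

import os

# Count images per lithology folder (expected: 800 in train, 200 in test for each class)
for split in ("train", "test"):
    split_dir = f"../drill_core/DCID-512-35/{split}"
    for cls in sorted(os.listdir(split_dir)):
        cls_dir = os.path.join(split_dir, cls)
        if os.path.isdir(cls_dir):
            n = sum(1 for f in os.listdir(cls_dir) if f.lower().endswith((".jpg", ".png")))
            print(f"{split}/{cls}: {n} images")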

Example of a training photo



The model uses transfer learning from EfficientNetB0 to speed up training.

From a purely neural-network point of view the model works, but when put to the test it often fails: for example, the two photos below should both fall into the Mudstone category, yet they are predicted differently (and the score of the second one is so low that the prediction should be discarded).
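Since the model outputs a softmax score for every class, one simple mitigation is to reject predictions whose top score falls below a threshold; a minimal sketch (the 0.6 cut-off is arbitrary, and `predictions` is the output of `model.predict` as in the inference script further down):

import numpy as np

def classify_with_rejection(predictions, class_names, threshold=0.6):
    """Return (class, score) for the top prediction, or (None, score) if it is too uncertain."""
    probs = predictions[0]          # first (and only) image in the batch
    idx = int(np.argmax(probs))
    score = float(probs[idx])
    if score < threshold:
        return None, score          # reject the prediction as too uncertain
    return class_names[idx], score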






import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import json
import os
import numpy as np

# Clear session
tf.keras.backend.clear_session()

# --- Paths ---
train_dir = "../drill_core/DCID-512-35/train"
val_dir = "../drill_core/DCID-512-35/test"

# --- Parameters ---
batch_size = 16
img_size = (224, 224)

# --- Verify data directories ---
for path in [train_dir, val_dir]:
    if not os.path.exists(path):
        raise FileNotFoundError(f"Directory not found: {path}")

# --- Load datasets as RGB ---
print("Loading training data (RGB)...")
train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    image_size=img_size,
    batch_size=batch_size,
    label_mode="int",
    color_mode="rgb",
    shuffle=True,
    seed=123
)

print("Loading validation data (RGB)...")
val_ds = tf.keras.utils.image_dataset_from_directory(
    val_dir,
    image_size=img_size,
    batch_size=batch_size,
    label_mode="int",
    color_mode="rgb",
    shuffle=False
)

# Debug: Check one batch
for images, labels in train_ds.take(1):
    print(f"Sample batch shape: {images.shape}")
    print(f"Pixel range: {tf.reduce_min(images):.1f} to {tf.reduce_max(images):.1f}")
    break

# --- Save class names ---
class_names = train_ds.class_names
num_classes = len(class_names)
with open("class_names.json", "w") as f:
    json.dump(class_names, f)
print("Classes:", class_names)

# --- Performance optimization ---
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# --- Data Augmentation ---
data_augmentation = keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
])

# --- STEP: Download EfficientNetB0 weights WITHOUT hash check ---
print("Downloading EfficientNetB0 weights (skipping hash check)...")
try:
    weights_path = keras.utils.get_file(
        fname="efficientnetb0_notop.h5",
        origin="https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5",
        cache_subdir="models",
        file_hash=None  # skip the hash check
    )
    print(f"✅ Weights downloaded at: {weights_path}")
except Exception as e:
    print(f"❌ Download failed: {e}")
    raise

# --- Build Model ---
print("Building model...")
base_model = keras.applications.EfficientNetB0(
    input_shape=img_size + (3,),
    include_top=False,
    weights=None  # Will load manually
)

# Load weights with skip_mismatch=True (robust)
base_model.load_weights(weights_path, by_name=True, skip_mismatch=True)
print("✅ Pretrained weights loaded (mismatches skipped)")

# Freeze base
base_model.trainable = False

# Add custom head
inputs = keras.Input(shape=img_size + (3,))
x = data_augmentation(inputs)
x = keras.applications.efficientnet.preprocess_input(x)
x = base_model(x, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs, outputs)

# Compile
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

print("\n✅ Model built successfully!")
model.summary()

# --- Callbacks ---
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

# --- Phase 1: Train head ---
print("\n๐Ÿš€ Phase 1: Training classifier head")
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=[early_stop]
)

# --- Phase 2: Fine-tune top layers ---
print("\n๐Ÿš€ Phase 2: Fine-tuning last 20 layers")
base_model.trainable = True
fine_tune_at = len(base_model.layers) - 20
for layer in base_model.layers[:fine_tune_at]:
    layer.trainable = False

model.compile(
    optimizer=keras.optimizers.Adam(1e-5),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history_fine = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[early_stop]
)

# --- Save model ---
model.save("drill_core_model_rgb.keras")
print("\n๐ŸŽ‰ Model saved as 'drill_core_model_rgb.keras'")

# --- Plot results ---
plt.figure(figsize=(10, 6))
plt.plot(history.history["accuracy"], label="Train Acc (Phase 1)")
plt.plot(history.history["val_accuracy"], label="Val Acc (Phase 1)")
if 'history_fine' in locals():
    plt.plot(range(len(history.history["accuracy"]),
                   len(history.history["accuracy"]) + len(history_fine.history["accuracy"])),
             history_fine.history["accuracy"], label="Train Acc (Fine-tune)")
    plt.plot(range(len(history.history["val_accuracy"]),
                   len(history.history["val_accuracy"]) + len(history_fine.history["val_accuracy"])),
             history_fine.history["val_accuracy"], label="Val Acc (Fine-tune)")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Training History")
plt.legend()
plt.grid(True)
plt.show()

# Final eval
loss, acc = model.evaluate(val_ds)
print(f"Final accuracy: {acc:.4f}")



✅ Model built successfully!
Model: "functional_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ input_layer_1 (InputLayer)           │ (None, 224, 224, 3)         │               0 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ sequential (Sequential)              │ (None, 224, 224, 3)         │               0 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ efficientnetb0 (Functional)          │ (None, 7, 7, 1280)          │       4,049,571 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ global_average_pooling2d             │ (None, 1280)                │               0 │
│ (GlobalAveragePooling2D)             │                             │                 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dropout (Dropout)                    │ (None, 1280)                │               0 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense (Dense)                        │ (None, 35)                  │          44,835 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
 Total params: 4,094,406 (15.62 MB)
 Trainable params: 44,835 (175.14 KB)
 Non-trainable params: 4,049,571 (15.45 MB)

🚀 Phase 1: Training classifier head
Epoch 1/20
2025-08-24 10:55:54.541328: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 357s 201ms/step - accuracy: 0.8292 - loss: 0.7887 - val_accuracy: 0.8367 - val_loss: 0.5065
Epoch 2/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 351s 201ms/step - accuracy: 0.9231 - loss: 0.3001 - val_accuracy: 0.8489 - val_loss: 0.4491
Epoch 3/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 354s 202ms/step - accuracy: 0.9369 - loss: 0.2314 - val_accuracy: 0.8680 - val_loss: 0.3988
Epoch 4/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 363s 207ms/step - accuracy: 0.9413 - loss: 0.2039 - val_accuracy: 0.8666 - val_loss: 0.3918
Epoch 5/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 401s 229ms/step - accuracy: 0.9484 - loss: 0.1757 - val_accuracy: 0.8661 - val_loss: 0.4093
Epoch 6/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 480s 274ms/step - accuracy: 0.9478 - loss: 0.1723 - val_accuracy: 0.8796 - val_loss: 0.3748
Epoch 7/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 382s 218ms/step - accuracy: 0.9498 - loss: 0.1622 - val_accuracy: 0.8844 - val_loss: 0.3458
Epoch 8/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 364s 208ms/step - accuracy: 0.9524 - loss: 0.1521 - val_accuracy: 0.8744 - val_loss: 0.3995
Epoch 9/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 361s 206ms/step - accuracy: 0.9527 - loss: 0.1473 - val_accuracy: 0.8894 - val_loss: 0.3432
Epoch 10/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 401s 229ms/step - accuracy: 0.9573 - loss: 0.1403 - val_accuracy: 0.8930 - val_loss: 0.3206
Epoch 11/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 403s 230ms/step - accuracy: 0.9549 - loss: 0.1396 - val_accuracy: 0.8853 - val_loss: 0.3568
Epoch 12/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 400s 229ms/step - accuracy: 0.9558 - loss: 0.1361 - val_accuracy: 0.8936 - val_loss: 0.3324
Epoch 13/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 399s 228ms/step - accuracy: 0.9565 - loss: 0.1348 - val_accuracy: 0.8954 - val_loss: 0.3220
Epoch 14/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 401s 229ms/step - accuracy: 0.9564 - loss: 0.1350 - val_accuracy: 0.9009 - val_loss: 0.2988
Epoch 15/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 400s 229ms/step - accuracy: 0.9575 - loss: 0.1335 - val_accuracy: 0.9059 - val_loss: 0.2784
Epoch 16/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 346s 198ms/step - accuracy: 0.9581 - loss: 0.1300 - val_accuracy: 0.8990 - val_loss: 0.3237
Epoch 17/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 342s 195ms/step - accuracy: 0.9590 - loss: 0.1260 - val_accuracy: 0.8816 - val_loss: 0.3762
Epoch 18/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 344s 196ms/step - accuracy: 0.9599 - loss: 0.1224 - val_accuracy: 0.8989 - val_loss: 0.3151
Epoch 19/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 343s 196ms/step - accuracy: 0.9590 - loss: 0.1257 - val_accuracy: 0.9044 - val_loss: 0.2896
Epoch 20/20
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 344s 196ms/step - accuracy: 0.9579 - loss: 0.1261 - val_accuracy: 0.9111 - val_loss: 0.2622

🚀 Phase 2: Fine-tuning last 20 layers
Epoch 1/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 397s 221ms/step - accuracy: 0.7774 - loss: 0.8018 - val_accuracy: 0.9043 - val_loss: 0.3364
Epoch 2/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 399s 228ms/step - accuracy: 0.8726 - loss: 0.4119 - val_accuracy: 0.9297 - val_loss: 0.2536
Epoch 3/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 385s 220ms/step - accuracy: 0.9009 - loss: 0.3149 - val_accuracy: 0.9349 - val_loss: 0.2383
Epoch 4/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 381s 218ms/step - accuracy: 0.9183 - loss: 0.2574 - val_accuracy: 0.9411 - val_loss: 0.2193
Epoch 5/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 384s 219ms/step - accuracy: 0.9282 - loss: 0.2232 - val_accuracy: 0.9423 - val_loss: 0.2088
Epoch 6/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 381s 218ms/step - accuracy: 0.9343 - loss: 0.2081 - val_accuracy: 0.9474 - val_loss: 0.1850
Epoch 7/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 383s 219ms/step - accuracy: 0.9419 - loss: 0.1797 - val_accuracy: 0.9473 - val_loss: 0.1850
Epoch 8/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 380s 217ms/step - accuracy: 0.9475 - loss: 0.1655 - val_accuracy: 0.9491 - val_loss: 0.1837
Epoch 9/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 382s 218ms/step - accuracy: 0.9490 - loss: 0.1581 - val_accuracy: 0.9516 - val_loss: 0.1717
Epoch 10/10
1750/1750 ━━━━━━━━━━━━━━━━━━━━ 381s 218ms/step - accuracy: 0.9530 - loss: 0.1425 - val_accuracy: 0.9500 - val_loss: 0.1753


import tensorflow as tf
import numpy as np
import json
import os
from tensorflow.keras.preprocessing import image
import matplotlib.pyplot as plt

# --- Paths ---
model_path = "drill_core_model_rgb.keras"
classes_path = "class_names.json"
image_path = "../drill_core/DCID-512-35/test/1.Red sandstone/1.Red sandstone.jpg" # ← Replace or pass as input
# Example: image_path = "../drill_core/DCID-512-35/test/class_name/sample.jpg"

# --- Load class names ---
if not os.path.exists(classes_path):
    raise FileNotFoundError(f"Class names file not found: {classes_path}")
with open(classes_path, "r") as f:
    class_names = json.load(f)
print("Loaded classes:", class_names)

# --- Load model ---
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found: {model_path}")
print("Loading model...")
model = tf.keras.models.load_model(model_path)
print("✅ Model loaded successfully!")

# --- Image preprocessing function ---
def preprocess_image(img_path, target_size=(224, 224)):
    """Load and preprocess a single image for inference."""
    if not os.path.exists(img_path):
        raise FileNotFoundError(f"Image not found: {img_path}")
    # Load image
    img = image.load_img(img_path, target_size=target_size)  # Resize to 224x224
    img_array = image.img_to_array(img)  # To numpy array
    img_array = tf.expand_dims(img_array, 0)  # Add batch dim (1, 224, 224, 3)
    img_array = tf.keras.applications.efficientnet.preprocess_input(img_array)  # Preprocess
    return img, img_array

# --- Prediction function ---
def predict_image(img_path, model, class_names, show=True):
    """Run inference on a single image."""
    print(f"\n📌 Predicting: {os.path.basename(img_path)}")
    # Preprocess
    img, img_array = preprocess_image(img_path)
    # Predict
    predictions = model.predict(img_array, verbose=0)
    predicted_class_idx = np.argmax(predictions[0])
    predicted_class = class_names[predicted_class_idx]
    confidence = predictions[0][predicted_class_idx]
    # Get top-3 predictions
    top_indices = np.argsort(predictions[0])[::-1][:3]
    print("Top 3 predictions:")
    for i in top_indices:
        print(f"  {class_names[i]}: {predictions[0][i]:.4f}")
    # Show image and prediction
    if show:
        plt.figure(figsize=(6, 6))
        plt.imshow(img)
        plt.title(f"Predicted: {predicted_class}\nConfidence: {confidence:.4f}", fontsize=14)
        plt.axis("off")
        plt.tight_layout()
        plt.show()
    return predicted_class, confidence

# --- Batch prediction on a folder ---
def predict_folder(folder_path, model, class_names, max_images=10):
    """Run inference on all images in a folder."""
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        return
    print(f"\n📁 Predicting images in: {folder_path}")
    count = 0
    for root, _, files in os.walk(folder_path):
        for f in files:
            if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
                img_path = os.path.join(root, f)
                predict_image(img_path, model, class_names, show=True)
                count += 1
                if count >= max_images:
                    print(f"\n🛑 Stopped after {max_images} images.")
                    return

# --- Option 1: Predict single image ---
single_image_path = './immagini/arenaria-marina-roccia-sedimentaria-campione-by2dwg-592163397.jpg'

if os.path.exists(single_image_path):
    predict_image(single_image_path, model, class_names, show=True)
else:
    print(f"⚠️ Image not found: {single_image_path}. Update 'single_image_path'.")

# --- Option 2: Predict multiple images from a folder ---
# Uncomment to run batch prediction
# predict_folder("../drill_core/DCID-512-35/test", model, class_names, max_images=5)



Friday, August 22, 2025

Tensor persistence on point clouds

This post tests the method presented in the paper: Wang, X.; Lyu, H.; He, W.; Chen, Q. Line Structure Extraction from LiDAR Point Cloud Based on the Persistence of Tensor Feature. Appl. Sci. 2022, 12, 9190. https://doi.org/10.3390/app12189190

The method extracts lineations and planes from a point cloud (in practice a generalization of RANSAC, although in this case it is not possible to discriminate between planes with different orientations).
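For reference, the classification in the script below relies on three shape descriptors computed from the eigenvalues λ1 ≥ λ2 ≥ λ3 of the local covariance tensor of each point's neighborhood:

linearity  = (λ1 − λ2) / λ1
planarity  = (λ2 − λ3) / λ1
sphericity = λ3 / λ1

A point is labelled linear when linearity > 0.8, planar when planarity > 0.8, and spherical (noise or isolated points) otherwise; these are the heuristic thresholds used in the code.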

The algorithm was tested on the point cloud derived from this model.




The method's only tunable parameter is the size of the search neighborhood at line 55:

[k, idx, _] = pcd_tree.search_knn_vector_3d(pcd.points[i], 100)
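Note that the call above actually fixes the number of neighbors (k = 100) rather than a metric radius; Open3D's KDTreeFlann also offers a radius search if a true radius is preferred (the 0.05 value here is just an illustrative guess):

# Hypothetical alternative: neighbors within a fixed metric radius instead of a fixed k
[k, idx, _] = pcd_tree.search_radius_vector_3d(pcd.points[i], 0.05)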


Example on staircase data (a dataset included in the example files)


import open3d as o3d
import numpy as np

def extract_tensor_features(pcd_path, neighborhood_radius=0.05):
    """
    Extracts linear, planar, and spherical features from a point cloud using
    the Tensor Feature method.

    Args:
        pcd_path (str): The path to the input .ply file.
        neighborhood_radius (float): The radius for neighborhood search.
    """
    # Step 1: Load the point cloud
    try:
        pcd = o3d.io.read_point_cloud(pcd_path)
        if len(pcd.points) == 0:
            print(f"Error: The point cloud file at {pcd_path} is empty.")
            return
        print(f"Loaded point cloud with {len(pcd.points)} points from {pcd_path}")
    except Exception as e:
        print(f"Error loading point cloud: {e}")
        return

    # Step 2: Estimate normals, which is necessary for tensor encoding
    pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamRadius(radius=neighborhood_radius))
    # CRITICAL CHECK: Ensure normals were successfully created
    if not pcd.has_normals():
        print("\nNormal estimation failed. This could be due to:")
        print("- The point cloud is empty or contains only a few points.")
        print("- The 'neighborhood_radius' is too small for the point cloud's density.")
        print("Please check your input file or try increasing the 'neighborhood_radius' parameter.")
        return

    pcd.orient_normals_consistent_tangent_plane(k=10)
    print("Normals estimated and oriented.")

    # Create a KDTree for efficient neighborhood searches
    pcd_tree = o3d.geometry.KDTreeFlann(pcd)

    # Prepare arrays to store classification results
    points = np.asarray(pcd.points)
    num_points = len(points)
    classification_labels = np.zeros(num_points, dtype=int)
    colors = np.zeros((num_points, 3))

    # Define colors for visualization
    LINEAR_COLOR = np.array((1.0, 0.0, 0.0))     # Red for linear features (e.g., edges, lines)
    PLANAR_COLOR = np.array((0.0, 1.0, 0.0))     # Green for planar features (e.g., surfaces, walls)
    SPHERICAL_COLOR = np.array((0.0, 0.0, 1.0))  # Blue for spherical features (e.g., noise, isolated points)

# Step 3 & 4: Compute tensor, decompose, and classify
for i in range(num_points):
# Find neighbors within the specified radius
[k, idx, _] = pcd_tree.search_knn_vector_3d(pcd.points[i], 100)
# If not enough neighbors, skip the point
if len(idx) < 10:
continue
neighbors = points[idx, :]
# Compute the covariance tensor
center = np.mean(neighbors, axis=0)
cov_tensor = np.cov((neighbors - center).T)

# Eigen-decomposition
eigenvalues, _ = np.linalg.eig(cov_tensor)
eigenvalues = np.sort(eigenvalues)[::-1] # Sort descending

lambda1, lambda2, lambda3 = eigenvalues

# Calculate shape features based on eigenvalues
# Linear feature: one large eigenvalue
linearity = (lambda1 - lambda2) / lambda1 if lambda1 > 0 else 0
# Planar feature: two large eigenvalues
planarity = (lambda2 - lambda3) / lambda1 if lambda1 > 0 else 0
# Spherical feature: all eigenvalues are similar
sphericity = lambda3 / lambda1 if lambda1 > 0 else 0

# Classify the point based on the dominant feature
if linearity > 0.8: # Heuristic threshold for linearity
classification_labels[i] = 1
colors[i] = LINEAR_COLOR
elif planarity > 0.8: # Heuristic threshold for planarity
classification_labels[i] = 2
colors[i] = PLANAR_COLOR
else:
classification_labels[i] = 3
colors[i] = SPHERICAL_COLOR
# Step 5: Visualize the results
pcd.colors = o3d.utility.Vector3dVector(colors)
o3d.visualization.draw_geometries([pcd], window_name="Point Cloud Feature Extraction")
# Print a summary of the results
linear_points = np.sum(classification_labels == 1)
planar_points = np.sum(classification_labels == 2)
spherical_points = np.sum(classification_labels == 3)
print("\n--- Classification Summary ---")
print(f"Linear Features: {linear_points} points ({linear_points/num_points*100:.2f}%)")
print(f"Planar Features: {planar_points} points ({planar_points/num_points*100:.2f}%)")
print(f"Spherical Features: {spherical_points} points ({spherical_points/num_points*100:.2f}%)")

if __name__ == "__main__":
    file_path = "./output_depth/pointcloud.ply"
    extract_tensor_features(file_path)

DepthAnything Metric

DepthAnything is a model (or rather several models, depending on the training set) for monocular depth estimation. Its advantage over MiDaS is that it has an outdoor training set that yields metric depth values instead of relative ones.

The training dataset is KITTI, i.e. an urban environment specific to autonomous driving, but let's see how it behaves on natural rock outcrops (given the opportunity, DepthAnything could also be fine-tuned on one's own dataset).


Input RGB image
Depth map




from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import torch
import numpy as np
from PIL import Image
import cv2
import os

# Configuration
model_name = "depth-anything/Depth-Anything-V2-Metric-Outdoor-Large-hf"
# Or: "depth-anything/Depth-Anything-V2-Large" (relative)
input_image_path = "./images/rgb_0001.png"
output_dir = "output_depth"
os.makedirs(output_dir, exist_ok=True)

# Load image
image = Image.open(input_image_path).convert("RGB")

# Load model and processor
image_processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModelForDepthEstimation.from_pretrained(model_name)

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Inference
with torch.no_grad():
    inputs = image_processor(images=image, return_tensors="pt").to(device)
    outputs = model(**inputs)

# Check which attribute exists and use the correct one
if hasattr(outputs, 'predicted_depth'):
    depth_tensor = outputs.predicted_depth
elif hasattr(outputs, 'depth'):
    depth_tensor = outputs.depth
else:
    # Fallback: get the first tensor from outputs
    depth_tensor = outputs.last_hidden_state if hasattr(outputs, 'last_hidden_state') else list(outputs.values())[0]

# Manual post-processing since the built-in method expects 'predicted_depth'
# Get the depth tensor and resize it to the original image size
depth_map = depth_tensor.squeeze().cpu()

# Resize depth map to original image size
original_size = (image.height, image.width)
if depth_map.shape != original_size:
    depth_map_np = depth_map.numpy()
    depth_map_np = cv2.resize(depth_map_np, (image.width, image.height), interpolation=cv2.INTER_LINEAR)
    depth_map = torch.from_numpy(depth_map_np)

# Put it in a list to match expected format
depth_maps = [depth_map]

# Get numpy depth map (already resized to original image size)
depth_map = depth_maps[0].numpy() # (H, W)

# Normalize for visualization
depth_vis = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
depth_vis = (depth_vis * 255).astype(np.uint8)
depth_color = cv2.applyColorMap(depth_vis, cv2.COLORMAP_INFERNO)

# Save outputs
cv2.imwrite(os.path.join(output_dir, "depth_color.png"), depth_color)
cv2.imwrite(os.path.join(output_dir, "depth_grayscale.png"), depth_vis)
np.save(os.path.join(output_dir, "depth_map.npy"), depth_map)

# Save original image for comparison
original_array = np.array(image)
cv2.imwrite(os.path.join(output_dir, "original.png"), cv2.cvtColor(original_array, cv2.COLOR_RGB2BGR))

print(f"✅ Depth map saved. Shape: {depth_map.shape}")
print(f"๐Ÿ“Š Depth range: {depth_map.min():.3f} to {depth_map.max():.3f}")
print(f"๐Ÿ’พ Files saved in: {output_dir}/")
print(f" - depth_color.png (colorized depth map)")
print(f" - depth_grayscale.png (grayscale depth map)")
print(f" - depth_map.npy (raw numpy array)")
print(f" - original.png (original image for comparison)")


Creating a point cloud with the specific parameters of my RealSense D415

(FOV, cx, cy, fx and fy)
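For reference, a sketch of how these intrinsics can be read directly from the camera with pyrealsense2 (assuming the library is installed and a D415 is connected; the stream resolution is just an example):

import pyrealsense2 as rs

# Read the color-stream intrinsics (fx, fy, cx=ppx, cy=ppy) from a connected RealSense camera
pipeline = rs.pipeline()
config = rs.config()
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)
profile = pipeline.start(config)
try:
    intr = profile.get_stream(rs.stream.color).as_video_stream_profile().get_intrinsics()
    print(f"fx={intr.fx}, fy={intr.fy}, cx={intr.ppx}, cy={intr.ppy}")
finally:
    pipeline.stop()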

import numpy as np
from PIL import Image
import open3d as o3d
import os

def create_pointcloud_from_files(depth_path, rgb_path, output_path,
                                 fov_horizontal=60, max_depth=10.0):
    """
    Create a PLY point cloud from depth map and RGB image files
    Args:
        depth_path: path to .npy depth map file
        rgb_path: path to RGB image file
        output_path: path to save the .ply file
        fov_horizontal: horizontal field of view in degrees (reported only)
        max_depth: maximum depth to include in meters
    """
    # Load depth map
    print(f"Loading depth map from: {depth_path}")
    depth_map = np.load(depth_path)
    print(f"Depth map shape: {depth_map.shape}")
    print(f"Depth range: {depth_map.min():.3f} to {depth_map.max():.3f} meters")

    # Load RGB image
    print(f"Loading RGB image from: {rgb_path}")
    rgb_image = Image.open(rgb_path).convert('RGB')
    rgb_array = np.array(rgb_image)
    print(f"RGB image shape: {rgb_array.shape}")

    # Ensure RGB and depth have same dimensions
    if rgb_array.shape[:2] != depth_map.shape:
        print(f"Resizing RGB {rgb_array.shape[:2]} to match depth {depth_map.shape}")
        rgb_image = rgb_image.resize((depth_map.shape[1], depth_map.shape[0]))
        rgb_array = np.array(rgb_image)
    height, width = depth_map.shape

    # Camera intrinsics (hard-coded values for the RealSense D415 color stream;
    # the FOV argument is only printed for reference)
    fx = 616.6863403320312
    fy = 616.5963134765625
    cx = 318.4041748046875
    cy = 238.97064208984375
    print("Camera intrinsics:")
    print(f"  fx={fx:.1f}, fy={fy:.1f}")
    print(f"  cx={cx:.1f}, cy={cy:.1f}")
    print(f"  FOV: {fov_horizontal}°")

    # Create coordinate grids
    x, y = np.meshgrid(np.arange(width), np.arange(height))

    # Convert to 3D coordinates using the pinhole camera model
    z = depth_map
    x_3d = (x - cx) * z / fx
    y_3d = (y - cy) * z / fy

    # Filter valid depths
    valid_mask = (z > 0) & (z < max_depth) & np.isfinite(z)
    num_valid = np.sum(valid_mask)
    print(f"Valid points: {num_valid:,} / {depth_map.size:,} ({100*num_valid/depth_map.size:.1f}%)")

    # Extract valid points and colors
    points_3d = np.stack([
        x_3d[valid_mask],
        -y_3d[valid_mask],  # Flip Y for correct orientation
        z[valid_mask]
    ], axis=1)
    colors = rgb_array[valid_mask] / 255.0  # Normalize to [0, 1]

    # Create Open3D point cloud
    print("Creating point cloud...")
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points_3d)
    pcd.colors = o3d.utility.Vector3dVector(colors)

    # Optional: Remove statistical outliers
    # Note: remove_statistical_outlier returns (filtered cloud, indices of the points kept)
    print("Removing outliers...")
    pcd_filtered, inlier_indices = pcd.remove_statistical_outlier(
        nb_neighbors=20, std_ratio=2.0
    )
    print(f"Removed {len(pcd.points) - len(inlier_indices):,} outliers")
    print(f"Final point cloud: {len(pcd_filtered.points):,} points")

    # Save point cloud
    print(f"Saving point cloud to: {output_path}")
    success = o3d.io.write_point_cloud(output_path, pcd_filtered)
    if success:
        print("✅ Point cloud saved successfully!")
        # Print file size
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
        print(f"📁 File size: {file_size:.2f} MB")
        return pcd_filtered
    else:
        print("❌ Failed to save point cloud")
        return None

def visualize_pointcloud(pcd):
    """Visualize the point cloud"""
    try:
        print("\n🔍 Visualizing point cloud...")
        print("  - Use mouse to rotate, scroll to zoom")
        print("  - Close the window when done")
        o3d.visualization.draw_geometries([pcd])
    except Exception as e:
        print(f"⚠️ Visualization failed: {e}")
        print("  You can open the .ply file in MeshLab, CloudCompare, or Blender")

if __name__ == "__main__":
    # File paths
    depth_file = "output_depth/depth_map.npy"
    rgb_file = "output_depth/original.png"
    output_file = "output_depth/pointcloud.ply"

    # Check if files exist
    if not os.path.exists(depth_file):
        print(f"❌ Depth file not found: {depth_file}")
        exit(1)
    if not os.path.exists(rgb_file):
        print(f"❌ RGB file not found: {rgb_file}")
        exit(1)

    # Create point cloud
    pcd = create_pointcloud_from_files(
        depth_path=depth_file,
        rgb_path=rgb_file,
        output_path=output_file,
        fov_horizontal=69.4,  # Adjust this based on your camera
        max_depth=15.0        # Adjust this to filter distant points
    )
    if pcd is not None:
        # Optional: visualize
        visualize_pointcloud(pcd)


MNF analysis on reflectance spectra of plastics

Devo cerca di lavorare su spettri di riflettanza di plastica e la prima domanda e': quale sono le bande significative? Sono partito dal ...