DepthAnything is a model (or rather, a family of models, depending on the training set) for monocular depth estimation. Its advantage over MiDaS is that the outdoor-trained variant returns metric depth values instead of relative ones.

The training dataset is KITTI, so a specifically urban, autonomous-driving environment, but let's see how it behaves on natural rock outcrops (given the means, DepthAnything could also be fine-tuned on one's own dataset).
*(image: starting RGB image)*

*(image: resulting depth map)*
```python
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import torch
import numpy as np
from PIL import Image
import cv2
import os

# Configuration
model_name = "depth-anything/Depth-Anything-V2-Metric-Outdoor-Large-hf"
# Or: "depth-anything/Depth-Anything-V2-Large" (relative depth)
input_image_path = "./images/rgb_0001.png"
output_dir = "output_depth"
os.makedirs(output_dir, exist_ok=True)

# Load image
image = Image.open(input_image_path).convert("RGB")

# Load model and processor
image_processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModelForDepthEstimation.from_pretrained(model_name)

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Inference
with torch.no_grad():
    inputs = image_processor(images=image, return_tensors="pt").to(device)
    outputs = model(**inputs)

# AutoModelForDepthEstimation returns a DepthEstimatorOutput whose
# `predicted_depth` tensor holds the prediction at model resolution
depth_tensor = outputs.predicted_depth

# Manual post-processing: move to CPU and resize back to the original image size
depth_map = depth_tensor.squeeze().cpu().numpy()  # (h, w)
if depth_map.shape != (image.height, image.width):
    depth_map = cv2.resize(depth_map, (image.width, image.height),
                           interpolation=cv2.INTER_LINEAR)

# Normalize for visualization
depth_vis = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
depth_vis = (depth_vis * 255).astype(np.uint8)
depth_color = cv2.applyColorMap(depth_vis, cv2.COLORMAP_INFERNO)

# Save outputs
cv2.imwrite(os.path.join(output_dir, "depth_color.png"), depth_color)
cv2.imwrite(os.path.join(output_dir, "depth_grayscale.png"), depth_vis)
np.save(os.path.join(output_dir, "depth_map.npy"), depth_map)

# Save original image for comparison
original_array = np.array(image)
cv2.imwrite(os.path.join(output_dir, "original.png"),
            cv2.cvtColor(original_array, cv2.COLOR_RGB2BGR))

print(f"✅ Depth map saved. Shape: {depth_map.shape}")
print(f"📊 Depth range: {depth_map.min():.3f} to {depth_map.max():.3f}")
print(f"💾 Files saved in: {output_dir}/")
print("   - depth_color.png (colorized depth map)")
print("   - depth_grayscale.png (grayscale depth map)")
print("   - depth_map.npy (raw numpy array)")
print("   - original.png (original image for comparison)")
```
Creating a point cloud with the specific intrinsic parameters of my RealSense D415 (FOV, cx, cy, fx and fy).
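Under the hood, the script below applies the standard pinhole back-projection, mapping each pixel $(u, v)$ with depth $Z$ to a 3D point:

$$X = \frac{(u - c_x)\,Z}{f_x}, \qquad Y = \frac{(v - c_y)\,Z}{f_y}$$

which is exactly what the `x_3d`/`y_3d` lines in the function compute.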
```python
import numpy as np
from PIL import Image
import open3d as o3d
import os


def create_pointcloud_from_files(depth_path, rgb_path, output_path,
                                 fov_horizontal=60, max_depth=10.0):
    """
    Create a PLY point cloud from a depth map and an RGB image.

    Args:
        depth_path: path to the .npy depth map file
        rgb_path: path to the RGB image file
        output_path: path to save the .ply file
        fov_horizontal: horizontal field of view in degrees (logged only;
            the intrinsics below are hardcoded for the RealSense D415)
        max_depth: maximum depth to include, in meters
    """
    # Load depth map
    print(f"Loading depth map from: {depth_path}")
    depth_map = np.load(depth_path)
    print(f"Depth map shape: {depth_map.shape}")
    print(f"Depth range: {depth_map.min():.3f} to {depth_map.max():.3f} meters")

    # Load RGB image
    print(f"Loading RGB image from: {rgb_path}")
    rgb_image = Image.open(rgb_path).convert('RGB')
    rgb_array = np.array(rgb_image)
    print(f"RGB image shape: {rgb_array.shape}")

    # Ensure RGB and depth have the same dimensions
    if rgb_array.shape[:2] != depth_map.shape:
        print(f"Resizing RGB {rgb_array.shape[:2]} to match depth {depth_map.shape}")
        rgb_image = rgb_image.resize((depth_map.shape[1], depth_map.shape[0]))
        rgb_array = np.array(rgb_image)

    height, width = depth_map.shape

    # Camera intrinsics: hardcoded values for my RealSense D415
    fx = 616.6863403320312
    fy = 616.5963134765625
    cx = 318.4041748046875
    cy = 238.97064208984375
    print("Camera intrinsics (RealSense D415):")
    print(f"  fx={fx:.1f}, fy={fy:.1f}")
    print(f"  cx={cx:.1f}, cy={cy:.1f}")
    print(f"  FOV: {fov_horizontal}°")

    # Create pixel coordinate grids
    x, y = np.meshgrid(np.arange(width), np.arange(height))

    # Back-project to 3D using the pinhole camera model
    z = depth_map
    x_3d = (x - cx) * z / fx
    y_3d = (y - cy) * z / fy

    # Filter valid depths
    valid_mask = (z > 0) & (z < max_depth) & np.isfinite(z)
    num_valid = np.sum(valid_mask)
    print(f"Valid points: {num_valid:,} / {depth_map.size:,} "
          f"({100 * num_valid / depth_map.size:.1f}%)")

    # Extract valid points and colors
    points_3d = np.stack([
        x_3d[valid_mask],
        -y_3d[valid_mask],  # Flip Y for correct orientation
        z[valid_mask]
    ], axis=1)
    colors = rgb_array[valid_mask] / 255.0  # Normalize to [0, 1]

    # Create Open3D point cloud
    print("Creating point cloud...")
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points_3d)
    pcd.colors = o3d.utility.Vector3dVector(colors)

    # Optional: remove statistical outliers. Note that
    # remove_statistical_outlier returns the filtered cloud plus the
    # indices of the points that were kept (inliers), not the outliers
    print("Removing outliers...")
    pcd_filtered, inlier_indices = pcd.remove_statistical_outlier(
        nb_neighbors=20, std_ratio=2.0
    )
    print(f"Removed {len(pcd.points) - len(inlier_indices):,} outliers")
    print(f"Final point cloud: {len(pcd_filtered.points):,} points")

    # Save point cloud
    print(f"Saving point cloud to: {output_path}")
    success = o3d.io.write_point_cloud(output_path, pcd_filtered)
    if success:
        print("✅ Point cloud saved successfully!")
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
        print(f"📁 File size: {file_size:.2f} MB")
        return pcd_filtered
    else:
        print("❌ Failed to save point cloud")
        return None


def visualize_pointcloud(pcd):
    """Visualize the point cloud in an interactive Open3D window."""
    try:
        print("\n🔍 Visualizing point cloud...")
        print("  - Use the mouse to rotate, scroll to zoom")
        print("  - Close the window when done")
        o3d.visualization.draw_geometries([pcd])
    except Exception as e:
        print(f"⚠️ Visualization failed: {e}")
        print("  You can open the .ply file in MeshLab, CloudCompare, or Blender")


if __name__ == "__main__":
    # File paths
    depth_file = "output_depth/depth_map.npy"
    rgb_file = "output_depth/original.png"
    output_file = "output_depth/pointcloud.ply"

    # Check that the input files exist
    if not os.path.exists(depth_file):
        print(f"❌ Depth file not found: {depth_file}")
        exit(1)
    if not os.path.exists(rgb_file):
        print(f"❌ RGB file not found: {rgb_file}")
        exit(1)

    # Create point cloud
    pcd = create_pointcloud_from_files(
        depth_path=depth_file,
        rgb_path=rgb_file,
        output_path=output_file,
        fov_horizontal=69.4,  # Adjust based on your camera
        max_depth=15.0        # Adjust to filter distant points
    )
    if pcd is not None:
        # Optional: visualize
        visualize_pointcloud(pcd)
```
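Since the intrinsics above are hardcoded, they can also be read directly from a connected camera through the pyrealsense2 SDK. A minimal sketch, assuming the library is installed and a D415 is plugged in:

```python
import pyrealsense2 as rs

# Start a pipeline with the default configuration and query the
# color stream's intrinsic parameters
pipe = rs.pipeline()
profile = pipe.start(rs.config())
intr = profile.get_stream(rs.stream.color).as_video_stream_profile().get_intrinsics()
print(f"fx={intr.fx}, fy={intr.fy}, cx={intr.ppx}, cy={intr.ppy}")
pipe.stop()
```

The values printed here are what `create_pointcloud_from_files` hardcodes as `fx`, `fy`, `cx` and `cy`.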

