DepthAnything is a model (or rather, a family of models, depending on the training set) for monocular depth estimation. Its advantage over MiDaS is that the outdoor-trained variant returns metric depth values instead of relative ones.

The training dataset is KITTI, so a specifically urban, autonomous-driving environment, but let's see how it behaves on natural rock outcrops (given the means, DepthAnything could also be fine-tuned on one's own dataset).
*(image: starting RGB image)*

*(image: resulting depth map)*
```python
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import torch
import numpy as np
from PIL import Image
import cv2
import os

# Configuration
model_name = "depth-anything/Depth-Anything-V2-Metric-Outdoor-Large-hf"
# Or: "depth-anything/Depth-Anything-V2-Large" (relative depth)
input_image_path = "./images/rgb_0001.png"
output_dir = "output_depth"
os.makedirs(output_dir, exist_ok=True)

# Load image
image = Image.open(input_image_path).convert("RGB")

# Load model and processor
image_processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModelForDepthEstimation.from_pretrained(model_name)

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Inference
with torch.no_grad():
    inputs = image_processor(images=image, return_tensors="pt").to(device)
    outputs = model(**inputs)

# AutoModelForDepthEstimation returns a DepthEstimatorOutput whose
# `predicted_depth` tensor holds the prediction at model resolution
depth_tensor = outputs.predicted_depth

# Manual post-processing: move to CPU and resize back to the original image size
depth_map = depth_tensor.squeeze().cpu().numpy()  # (h, w)
if depth_map.shape != (image.height, image.width):
    depth_map = cv2.resize(depth_map, (image.width, image.height),
                           interpolation=cv2.INTER_LINEAR)

# Normalize for visualization
depth_vis = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
depth_vis = (depth_vis * 255).astype(np.uint8)
depth_color = cv2.applyColorMap(depth_vis, cv2.COLORMAP_INFERNO)

# Save outputs
cv2.imwrite(os.path.join(output_dir, "depth_color.png"), depth_color)
cv2.imwrite(os.path.join(output_dir, "depth_grayscale.png"), depth_vis)
np.save(os.path.join(output_dir, "depth_map.npy"), depth_map)

# Save original image for comparison
original_array = np.array(image)
cv2.imwrite(os.path.join(output_dir, "original.png"),
            cv2.cvtColor(original_array, cv2.COLOR_RGB2BGR))

print(f"✅ Depth map saved. Shape: {depth_map.shape}")
print(f"📊 Depth range: {depth_map.min():.3f} to {depth_map.max():.3f}")
print(f"💾 Files saved in: {output_dir}/")
print("   - depth_color.png (colorized depth map)")
print("   - depth_grayscale.png (grayscale depth map)")
print("   - depth_map.npy (raw numpy array)")
print("   - original.png (original image for comparison)")
```
Creating a point cloud with the specific intrinsic parameters of my RealSense D415 (FOV, cx, cy, fx and fy).
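Under the hood, the script below applies the standard pinhole back-projection, mapping each pixel $(u, v)$ with depth $Z$ to a 3D point:

$$X = \frac{(u - c_x)\,Z}{f_x}, \qquad Y = \frac{(v - c_y)\,Z}{f_y}$$

which is exactly what the `x_3d`/`y_3d` lines in the function compute.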
```python
import numpy as np
from PIL import Image
import open3d as o3d
import os


def create_pointcloud_from_files(depth_path, rgb_path, output_path,
                                 fov_horizontal=60, max_depth=10.0):
    """
    Create a PLY point cloud from a depth map and an RGB image.

    Args:
        depth_path: path to the .npy depth map file
        rgb_path: path to the RGB image file
        output_path: path to save the .ply file
        fov_horizontal: horizontal field of view in degrees (logged only;
            the intrinsics below are hardcoded for the RealSense D415)
        max_depth: maximum depth to include, in meters
    """
    # Load depth map
    print(f"Loading depth map from: {depth_path}")
    depth_map = np.load(depth_path)
    print(f"Depth map shape: {depth_map.shape}")
    print(f"Depth range: {depth_map.min():.3f} to {depth_map.max():.3f} meters")

    # Load RGB image
    print(f"Loading RGB image from: {rgb_path}")
    rgb_image = Image.open(rgb_path).convert('RGB')
    rgb_array = np.array(rgb_image)
    print(f"RGB image shape: {rgb_array.shape}")

    # Ensure RGB and depth have the same dimensions
    if rgb_array.shape[:2] != depth_map.shape:
        print(f"Resizing RGB {rgb_array.shape[:2]} to match depth {depth_map.shape}")
        rgb_image = rgb_image.resize((depth_map.shape[1], depth_map.shape[0]))
        rgb_array = np.array(rgb_image)

    height, width = depth_map.shape

    # Camera intrinsics: hardcoded values for my RealSense D415
    fx = 616.6863403320312
    fy = 616.5963134765625
    cx = 318.4041748046875
    cy = 238.97064208984375
    print("Camera intrinsics (RealSense D415):")
    print(f"  fx={fx:.1f}, fy={fy:.1f}")
    print(f"  cx={cx:.1f}, cy={cy:.1f}")
    print(f"  FOV: {fov_horizontal}°")

    # Create pixel coordinate grids
    x, y = np.meshgrid(np.arange(width), np.arange(height))

    # Back-project to 3D using the pinhole camera model
    z = depth_map
    x_3d = (x - cx) * z / fx
    y_3d = (y - cy) * z / fy

    # Filter valid depths
    valid_mask = (z > 0) & (z < max_depth) & np.isfinite(z)
    num_valid = np.sum(valid_mask)
    print(f"Valid points: {num_valid:,} / {depth_map.size:,} "
          f"({100 * num_valid / depth_map.size:.1f}%)")

    # Extract valid points and colors
    points_3d = np.stack([
        x_3d[valid_mask],
        -y_3d[valid_mask],  # Flip Y for correct orientation
        z[valid_mask]
    ], axis=1)
    colors = rgb_array[valid_mask] / 255.0  # Normalize to [0, 1]

    # Create Open3D point cloud
    print("Creating point cloud...")
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points_3d)
    pcd.colors = o3d.utility.Vector3dVector(colors)

    # Optional: remove statistical outliers. Note that
    # remove_statistical_outlier returns the filtered cloud plus the
    # indices of the points that were kept (inliers), not the outliers
    print("Removing outliers...")
    pcd_filtered, inlier_indices = pcd.remove_statistical_outlier(
        nb_neighbors=20, std_ratio=2.0
    )
    print(f"Removed {len(pcd.points) - len(inlier_indices):,} outliers")
    print(f"Final point cloud: {len(pcd_filtered.points):,} points")

    # Save point cloud
    print(f"Saving point cloud to: {output_path}")
    success = o3d.io.write_point_cloud(output_path, pcd_filtered)
    if success:
        print("✅ Point cloud saved successfully!")
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
        print(f"📁 File size: {file_size:.2f} MB")
        return pcd_filtered
    else:
        print("❌ Failed to save point cloud")
        return None


def visualize_pointcloud(pcd):
    """Visualize the point cloud in an interactive Open3D window."""
    try:
        print("\n🔍 Visualizing point cloud...")
        print("  - Use the mouse to rotate, scroll to zoom")
        print("  - Close the window when done")
        o3d.visualization.draw_geometries([pcd])
    except Exception as e:
        print(f"⚠️ Visualization failed: {e}")
        print("  You can open the .ply file in MeshLab, CloudCompare, or Blender")


if __name__ == "__main__":
    # File paths
    depth_file = "output_depth/depth_map.npy"
    rgb_file = "output_depth/original.png"
    output_file = "output_depth/pointcloud.ply"

    # Check that the input files exist
    if not os.path.exists(depth_file):
        print(f"❌ Depth file not found: {depth_file}")
        exit(1)
    if not os.path.exists(rgb_file):
        print(f"❌ RGB file not found: {rgb_file}")
        exit(1)

    # Create point cloud
    pcd = create_pointcloud_from_files(
        depth_path=depth_file,
        rgb_path=rgb_file,
        output_path=output_file,
        fov_horizontal=69.4,  # Adjust based on your camera
        max_depth=15.0        # Adjust to filter distant points
    )
    if pcd is not None:
        # Optional: visualize
        visualize_pointcloud(pcd)
```
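Since the intrinsics above are hardcoded, they can also be read directly from a connected camera through the pyrealsense2 SDK. A minimal sketch, assuming the library is installed and a D415 is plugged in:

```python
import pyrealsense2 as rs

# Start a pipeline with the default configuration and query the
# color stream's intrinsic parameters
pipe = rs.pipeline()
profile = pipe.start(rs.config())
intr = profile.get_stream(rs.stream.color).as_video_stream_profile().get_intrinsics()
print(f"fx={intr.fx}, fy={intr.fy}, cx={intr.ppx}, cy={intr.ppy}")
pipe.stop()
```

The values printed here are what `create_pointcloud_from_files` hardcodes as `fx`, `fy`, `cx` and `cy`.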

