Mixed Bin Clearing
This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a vision-guided bin clearing pipeline. Tzara generates code that moves to a scan pose, detects all objects in a bin using QWEN, segments each detection with SAM, and computes grasp orientation via PCA on the binary mask in image space. The 3D grasp position is back-projected from the depth image using camera intrinsics, and each object is picked at a fixed grasp height before being dropped into a target box.
Demo
UR10e picking mixed objects from a bin and dropping into a box
The Natural Language Instruction
Pick different objects from the blue bin and drop them into box. Scan pose is [-0.25462, 0.59302, 0.3141, 180.0, 0.0, 90.0], drop pose is [-0.740, 0.727, -0.2, -180.0, 0.0, 90.0]. Use Sam after qwen get the mask and do PCA on that in image space. Make the grasp height GRASP_Z = 0.015.The Generated Code
python
# Telekinesis pipeline: UR10e + OnRobot RG6 mixed bin clearing
# Detects objects with QWEN VLM, segments with SAM, computes grasp orientation
# via PCA on the binary mask in image space, back-projects centroid to 3D.
import math
import numpy as np
import cv2
from loguru import logger
from telekinesis.synapse.robots.manipulators.universal_robots import UniversalRobotsUR10E
from telekinesis.synapse.tools.parallel_grippers.onrobot import OnRobotRG6
from telekinesis.medulla.cameras.realsense import RealSense
from telekinesis import retina
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
ROBOT_IP = "192.168.1.2"
GRIPPER_IP = "192.168.1.1"
SCAN_POSE = [-0.25462, 0.59302, 0.3141, 180.0, 0.0, 90.0]
DROP_POSE = [-0.740, 0.727, -0.2, -180.0, 0.0, 90.0]
GRASP_Z = 0.015 # fixed grasp Z height (m)
APPROACH_Z = 0.200 # approach/retreat Z height above grasp (m)
DROP_APPROACH_Z = 0.100 # approach height above drop pose (m)
# Gripper
GRIP_OPEN_MM = 110.0
GRIP_CLOSE_MM = 0.0
GRIP_FORCE_N = 40.0
# Motion
MOVE_SPEED = 0.3
MOVE_ACCEL = 0.5
MAX_ITERATIONS = 20 # safety guard — stop after this many pick cycles
# QWEN detection prompt
DETECTION_PROMPT = "objects in the bin"
# ---------------------------------------------------------------------------
# Connect hardware
# ---------------------------------------------------------------------------
robot = UniversalRobotsUR10E()
robot.connect(ROBOT_IP)
logger.success("Robot connected")
gripper = OnRobotRG6()
gripper.connect(ip=GRIPPER_IP, protocol="MODBUS_TCP")
gripper.set_unit("position", "mm")
gripper.set_unit("force", "N")
logger.success("Gripper connected")
camera = RealSense()
camera.connect()
logger.success("Camera connected")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def above(pose: list, dz: float) -> list:
p = list(pose)
p[2] += dz
return p
def pca_angle_from_mask(mask: np.ndarray) -> float:
"""Return the major-axis angle (radians) of the binary mask via image-space PCA."""
ys, xs = np.where(mask > 0)
if len(xs) < 10:
return 0.0
coords = np.stack([xs, ys], axis=1).astype(np.float64)
mean = coords.mean(axis=0)
centered = coords - mean
cov = np.cov(centered.T)
eigenvalues, eigenvectors = np.linalg.eigh(cov)
major_axis = eigenvectors[:, np.argmax(eigenvalues)]
return math.atan2(major_axis[1], major_axis[0])
def backproject_centroid(mask: np.ndarray, depth_image: np.ndarray, intrinsics) -> tuple:
"""Back-project the mask centroid pixel to 3D camera-frame XY using the depth image."""
ys, xs = np.where(mask > 0)
if len(xs) == 0:
return None
cx_px = int(np.mean(xs))
cy_px = int(np.mean(ys))
depth_m = depth_image[cy_px, cx_px] * 0.001 # RealSense depth in mm -> m
if depth_m <= 0.0:
# fallback: median over mask pixels
depths = depth_image[ys, xs].astype(np.float64) * 0.001
depths = depths[depths > 0]
if len(depths) == 0:
return None
depth_m = float(np.median(depths))
x_3d = (cx_px - intrinsics.ppx) * depth_m / intrinsics.fx
y_3d = (cy_px - intrinsics.ppy) * depth_m / intrinsics.fy
return x_3d, y_3d
# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------
try:
# Start at home with gripper open
gripper.move(position=GRIP_OPEN_MM, force=GRIP_FORCE_N, asynchronous=False)
robot.set_cartesian_pose(SCAN_POSE, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
intrinsics = camera.get_intrinsics()
iteration = 0
while iteration < MAX_ITERATIONS:
iteration += 1
logger.info(f"=== Scan iteration {iteration} ===")
# Capture
color_image = camera.get_color_frame()
depth_image = camera.get_depth_frame()
logger.info("Captured color + depth frames")
# Detect objects with QWEN
detections = retina.qwen(color_image, prompt=DETECTION_PROMPT)
logger.info(f"QWEN detected {len(detections)} object(s)")
if len(detections) == 0:
logger.success("No more objects detected — bin is clear")
break
picked_any = False
for det_idx, detection in enumerate(detections):
logger.info(f" Processing detection {det_idx + 1}/{len(detections)}: {detection.get('label', 'object')}")
# Segment with SAM using QWEN bounding box as prompt
bbox = detection.get("bbox") # [x1, y1, x2, y2] in pixels
mask = retina.segment_anything(color_image, bbox_prompt=bbox)
if mask is None or mask.sum() == 0:
logger.warning(f" SAM returned empty mask for detection {det_idx + 1}, skipping")
continue
# PCA in image space -> grasp rotation around Z
angle_rad = pca_angle_from_mask(mask)
angle_deg = math.degrees(angle_rad)
grasp_rz = SCAN_POSE[5] + angle_deg # adjust base Z rotation by PCA angle
logger.debug(f" PCA angle: {angle_deg:.1f} deg -> grasp rz = {grasp_rz:.1f} deg")
# Back-project centroid to robot XY
result = backproject_centroid(mask, depth_image, intrinsics)
if result is None:
logger.warning(f" Could not back-project centroid for detection {det_idx + 1}, skipping")
continue
x_cam, y_cam = result
# Transform camera-frame XY to robot-frame using scan pose as reference
# (assumes camera mounted at TCP pointing down; robot XY ~ camera XY with sign flip)
robot_x = SCAN_POSE[0] - x_cam
robot_y = SCAN_POSE[1] - y_cam
grasp_pose = [robot_x, robot_y, GRASP_Z, SCAN_POSE[3], SCAN_POSE[4], grasp_rz]
approach_pose = above(grasp_pose, APPROACH_Z - GRASP_Z)
logger.info(f" Grasp pose: {[round(v, 4) for v in grasp_pose]}")
# --- PICK ---
robot.set_cartesian_pose(approach_pose, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
gripper.move(position=GRIP_OPEN_MM, force=GRIP_FORCE_N, asynchronous=False)
robot.set_cartesian_pose(grasp_pose, speed=MOVE_SPEED * 0.5, acceleration=MOVE_ACCEL)
gripper.move(position=GRIP_CLOSE_MM, force=GRIP_FORCE_N, asynchronous=False)
logger.success(f" Grasped detection {det_idx + 1}")
robot.set_cartesian_pose(approach_pose, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
# --- DROP ---
drop_approach = list(DROP_POSE)
drop_approach[2] = DROP_POSE[2] + DROP_APPROACH_Z
robot.set_cartesian_pose(drop_approach, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
robot.set_cartesian_pose(DROP_POSE, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
gripper.move(position=GRIP_OPEN_MM, force=GRIP_FORCE_N, asynchronous=False)
logger.success(f" Dropped detection {det_idx + 1} into box")
robot.set_cartesian_pose(drop_approach, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
# Return to scan pose for next object
robot.set_cartesian_pose(SCAN_POSE, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
picked_any = True
# Re-scan after each pick so detections stay fresh
break
if not picked_any:
logger.warning("No valid grasp found in this scan — stopping")
break
logger.success(f"Bin clearing complete after {iteration} iteration(s)")
finally:
try:
camera.disconnect()
except Exception as e:
logger.warning(f"Camera disconnect failed: {e}")
try:
gripper.disconnect()
except Exception as e:
logger.warning(f"Gripper disconnect failed: {e}")
try:
robot.disconnect()
except Exception as e:
logger.warning(f"Robot disconnect failed: {e}")
