Skip to content

Mixed Bin Clearing

This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a vision-guided bin clearing pipeline. Tzara generates code that moves to a scan pose, detects all objects in a bin using QWEN, segments each detection with SAM, and computes grasp orientation via PCA on the binary mask in image space. The 3D grasp position is back-projected from the depth image using camera intrinsics, and each object is picked at a fixed grasp height before being dropped into a target box.

Demo

UR10e picking mixed objects from a bin and dropping into a box

The Natural Language Instruction

Pick different objects from the blue bin and drop them into box. Scan pose is [-0.25462, 0.59302, 0.3141, 180.0, 0.0, 90.0], drop pose is [-0.740, 0.727, -0.2, -180.0, 0.0, 90.0]. Use Sam after qwen get the mask and do PCA on that in image space. Make the grasp height GRASP_Z = 0.015.

The Generated Code

python
# Telekinesis pipeline: UR10e + OnRobot RG6 mixed bin clearing
# Detects objects with QWEN VLM, segments with SAM, computes grasp orientation
# via PCA on the binary mask in image space, back-projects centroid to 3D.

import math
import numpy as np
import cv2
from loguru import logger

from telekinesis.synapse.robots.manipulators.universal_robots import UniversalRobotsUR10E
from telekinesis.synapse.tools.parallel_grippers.onrobot import OnRobotRG6
from telekinesis.medulla.cameras.realsense import RealSense
from telekinesis import retina

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

ROBOT_IP   = "192.168.1.2"
GRIPPER_IP = "192.168.1.1"

SCAN_POSE = [-0.25462, 0.59302, 0.3141, 180.0, 0.0, 90.0]
DROP_POSE = [-0.740,   0.727,  -0.2,  -180.0, 0.0, 90.0]

GRASP_Z         = 0.015    # fixed grasp Z height (m)
APPROACH_Z      = 0.200    # approach/retreat Z height above grasp (m)
DROP_APPROACH_Z = 0.100    # approach height above drop pose (m)

# Gripper
GRIP_OPEN_MM    = 110.0
GRIP_CLOSE_MM   = 0.0
GRIP_FORCE_N    = 40.0

# Motion
MOVE_SPEED = 0.3
MOVE_ACCEL = 0.5

MAX_ITERATIONS = 20   # safety guard — stop after this many pick cycles

# QWEN detection prompt
DETECTION_PROMPT = "objects in the bin"

# ---------------------------------------------------------------------------
# Connect hardware
# ---------------------------------------------------------------------------

robot = UniversalRobotsUR10E()
robot.connect(ROBOT_IP)
logger.success("Robot connected")

gripper = OnRobotRG6()
gripper.connect(ip=GRIPPER_IP, protocol="MODBUS_TCP")
gripper.set_unit("position", "mm")
gripper.set_unit("force", "N")
logger.success("Gripper connected")

camera = RealSense()
camera.connect()
logger.success("Camera connected")

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def above(pose: list, dz: float) -> list:
    p = list(pose)
    p[2] += dz
    return p


def pca_angle_from_mask(mask: np.ndarray) -> float:
    """Return the major-axis angle (radians) of the binary mask via image-space PCA."""
    ys, xs = np.where(mask > 0)
    if len(xs) < 10:
        return 0.0
    coords = np.stack([xs, ys], axis=1).astype(np.float64)
    mean = coords.mean(axis=0)
    centered = coords - mean
    cov = np.cov(centered.T)
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    major_axis = eigenvectors[:, np.argmax(eigenvalues)]
    return math.atan2(major_axis[1], major_axis[0])


def backproject_centroid(mask: np.ndarray, depth_image: np.ndarray, intrinsics) -> tuple:
    """Back-project the mask centroid pixel to 3D camera-frame XY using the depth image."""
    ys, xs = np.where(mask > 0)
    if len(xs) == 0:
        return None
    cx_px = int(np.mean(xs))
    cy_px = int(np.mean(ys))
    depth_m = depth_image[cy_px, cx_px] * 0.001   # RealSense depth in mm -> m
    if depth_m <= 0.0:
        # fallback: median over mask pixels
        depths = depth_image[ys, xs].astype(np.float64) * 0.001
        depths = depths[depths > 0]
        if len(depths) == 0:
            return None
        depth_m = float(np.median(depths))

    x_3d = (cx_px - intrinsics.ppx) * depth_m / intrinsics.fx
    y_3d = (cy_px - intrinsics.ppy) * depth_m / intrinsics.fy
    return x_3d, y_3d


# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------

try:
    # Start at home with gripper open
    gripper.move(position=GRIP_OPEN_MM, force=GRIP_FORCE_N, asynchronous=False)
    robot.set_cartesian_pose(SCAN_POSE, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)

    intrinsics = camera.get_intrinsics()
    iteration  = 0

    while iteration < MAX_ITERATIONS:
        iteration += 1
        logger.info(f"=== Scan iteration {iteration} ===")

        # Capture
        color_image = camera.get_color_frame()
        depth_image = camera.get_depth_frame()
        logger.info("Captured color + depth frames")

        # Detect objects with QWEN
        detections = retina.qwen(color_image, prompt=DETECTION_PROMPT)
        logger.info(f"QWEN detected {len(detections)} object(s)")

        if len(detections) == 0:
            logger.success("No more objects detected — bin is clear")
            break

        picked_any = False

        for det_idx, detection in enumerate(detections):
            logger.info(f"  Processing detection {det_idx + 1}/{len(detections)}: {detection.get('label', 'object')}")

            # Segment with SAM using QWEN bounding box as prompt
            bbox = detection.get("bbox")   # [x1, y1, x2, y2] in pixels
            mask = retina.segment_anything(color_image, bbox_prompt=bbox)
            if mask is None or mask.sum() == 0:
                logger.warning(f"  SAM returned empty mask for detection {det_idx + 1}, skipping")
                continue

            # PCA in image space -> grasp rotation around Z
            angle_rad   = pca_angle_from_mask(mask)
            angle_deg   = math.degrees(angle_rad)
            grasp_rz    = SCAN_POSE[5] + angle_deg   # adjust base Z rotation by PCA angle
            logger.debug(f"  PCA angle: {angle_deg:.1f} deg -> grasp rz = {grasp_rz:.1f} deg")

            # Back-project centroid to robot XY
            result = backproject_centroid(mask, depth_image, intrinsics)
            if result is None:
                logger.warning(f"  Could not back-project centroid for detection {det_idx + 1}, skipping")
                continue
            x_cam, y_cam = result

            # Transform camera-frame XY to robot-frame using scan pose as reference
            # (assumes camera mounted at TCP pointing down; robot XY ~ camera XY with sign flip)
            robot_x = SCAN_POSE[0] - x_cam
            robot_y = SCAN_POSE[1] - y_cam

            grasp_pose   = [robot_x, robot_y, GRASP_Z,    SCAN_POSE[3], SCAN_POSE[4], grasp_rz]
            approach_pose = above(grasp_pose, APPROACH_Z - GRASP_Z)

            logger.info(f"  Grasp pose: {[round(v, 4) for v in grasp_pose]}")

            # --- PICK ---
            robot.set_cartesian_pose(approach_pose, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
            gripper.move(position=GRIP_OPEN_MM, force=GRIP_FORCE_N, asynchronous=False)
            robot.set_cartesian_pose(grasp_pose, speed=MOVE_SPEED * 0.5, acceleration=MOVE_ACCEL)
            gripper.move(position=GRIP_CLOSE_MM, force=GRIP_FORCE_N, asynchronous=False)
            logger.success(f"  Grasped detection {det_idx + 1}")

            robot.set_cartesian_pose(approach_pose, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)

            # --- DROP ---
            drop_approach = list(DROP_POSE)
            drop_approach[2] = DROP_POSE[2] + DROP_APPROACH_Z
            robot.set_cartesian_pose(drop_approach, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
            robot.set_cartesian_pose(DROP_POSE, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
            gripper.move(position=GRIP_OPEN_MM, force=GRIP_FORCE_N, asynchronous=False)
            logger.success(f"  Dropped detection {det_idx + 1} into box")

            robot.set_cartesian_pose(drop_approach, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)

            # Return to scan pose for next object
            robot.set_cartesian_pose(SCAN_POSE, speed=MOVE_SPEED, acceleration=MOVE_ACCEL)
            picked_any = True

            # Re-scan after each pick so detections stay fresh
            break

        if not picked_any:
            logger.warning("No valid grasp found in this scan — stopping")
            break

    logger.success(f"Bin clearing complete after {iteration} iteration(s)")

finally:
    try:
        camera.disconnect()
    except Exception as e:
        logger.warning(f"Camera disconnect failed: {e}")
    try:
        gripper.disconnect()
    except Exception as e:
        logger.warning(f"Gripper disconnect failed: {e}")
    try:
        robot.disconnect()
    except Exception as e:
        logger.warning(f"Robot disconnect failed: {e}")