Skip to content

Detect Cups from Multi Prompts

This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a computer vision pipeline that detects cups from a webcam feed through a series of successive natural language prompts. Tzara generates code that captures an image from the webcam, runs a pre-trained YOLOX object detection model to find cups, and outputs bounding boxes and visualizations with OpenCV.

The Natural Language Instruction: Prompt 1

The natural language instruction for this task is:

capture an image with a webcam and detect the cup in the image

The Generated Code for Prompt 1

The generated code captures an image from the webcam, detects cups using an object detection model, and prints the bounding boxes and confidence scores for each detected cup.

python
# Pipeline: Capture image from webcam and detect cup using object detection
# Steps:
# 1. Import required modules
# 2. Initialize and connect to webcam
# 3. Capture a single frame (releasing the camera immediately afterwards)
# 4. Run object detection using YOLOX to find the cup
# 5. Print/display the detection results

from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina

# Step 1: Initialize the webcam with a unique name and default camera ID (0)
cam = Webcam(name="webcam_0", camera_id=0)

# Step 2: Connect to the webcam
cam.connect()

try:
    # Step 3: Capture a single color frame from the webcam
    frame = cam.capture_single_color_frame()
finally:
    # Always release the camera, even if the capture raises, so the device
    # is not left locked for subsequent runs.
    cam.disconnect()

# Step 4: Detect objects in the captured frame using YOLOX
# YOLOX is trained on COCO which includes 'cup' as a category
annotations, categories = retina.detect_objects_using_yolox(
    image=frame,
    score_threshold=0.25,
    nms_threshold=0.45,
)

# Step 5: Filter and print detections that correspond to 'cup'
# Build a mapping from category id to category name
# (categories may arrive as dicts or as objects with .id/.name attributes)
category_id_to_name = {}
if hasattr(categories, 'categories') and categories.categories is not None:
    for cat in categories.categories:
        if isinstance(cat, dict):
            category_id_to_name[cat['id']] = cat['name']
        else:
            category_id_to_name[cat.id] = cat.name

print("Detection results:")
cup_detections = []
if hasattr(annotations, 'annotations') and annotations.annotations is not None:
    for ann in annotations.annotations:
        if isinstance(ann, dict):
            cat_id = ann.get('category_id')
            cat_name = category_id_to_name.get(cat_id, "unknown")
            score = ann.get('score', None)
            bbox = ann.get('bbox', None)
        else:
            cat_id = ann.category_id
            cat_name = category_id_to_name.get(cat_id, "unknown")
            score = ann.score
            bbox = ann.bbox

        if cat_name.lower() == 'cup':
            cup_detections.append({'category': cat_name, 'score': score, 'bbox': bbox})
            # Guard against a missing score: formatting None with :.2f raises TypeError.
            score_str = f"{score:.2f}" if score is not None else "n/a"
            print(f"  Cup detected! Score: {score_str}, BBox: {bbox}")

if not cup_detections:
    print("  No cup detected in the image.")
else:
    print(f"  Total cups detected: {len(cup_detections)}")

The Natural Language Instruction: Prompt 2

To add visualization of the detected cups, we can prompt Tzara with:

could you add visualization

The Generated Code for Prompt 2

The generated code adds visualization of the detected cups using OpenCV to draw bounding boxes around them and display the image.

python
# Pipeline: Capture image from webcam, detect cup, draw bounding boxes, display result

# Step 1: Import required modules
import cv2
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina

# Step 2: Initialize and connect the webcam
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()

try:
    # Step 3: Capture a single color frame from the webcam
    # Returns an RGB numpy array
    frame_rgb = cam.capture_single_color_frame()
finally:
    # Step 4: Always release the webcam after capture, even if it raises
    cam.disconnect()

# Step 5: Detect objects using YOLOX (returns COCO-style annotations and categories)
annotations, categories = retina.detect_objects_using_yolox(
    image=frame_rgb,
    score_threshold=0.25,
    nms_threshold=0.45,
)

# Step 6: Filter detections to only keep "cup" category
# Build a mapping from category id to category name
cat_id_to_name = {}
if categories is not None:
    for cat in categories.categories:
        cat_id_to_name[cat["id"]] = cat["name"]

# Step 7: Draw bounding boxes on the image for cup detections
# Convert RGB frame to BGR for OpenCV display
frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)

cup_count = 0
if annotations is not None:
    for ann in annotations.annotations:
        category_id = ann["category_id"]
        category_name = cat_id_to_name.get(category_id, "").lower()

        # Only draw boxes for "cup" detections
        if category_name != "cup":
            continue

        cup_count += 1
        # bbox is in COCO XYWH format [x, y, w, h]
        bbox = ann["bbox"]
        x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
        score = ann.get("score", None)

        # Draw bounding box rectangle (green color)
        cv2.rectangle(frame_bgr, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # Prepare label text with score if available
        label = f"cup: {score:.2f}" if score is not None else "cup"

        # Draw label background, clamped to the top image edge so boxes
        # near row 0 don't push the label outside the frame
        (text_w, text_h), baseline = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2
        )
        bg_top = max(y - text_h - baseline - 4, 0)
        cv2.rectangle(
            frame_bgr,
            (x, bg_top),
            (x + text_w, y),
            (0, 255, 0),
            -1,
        )

        # Draw label text (black on the green background)
        cv2.putText(
            frame_bgr,
            label,
            (x, max(y - baseline - 2, text_h)),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            (0, 0, 0),
            2,
        )

print(f"Detected {cup_count} cup(s) in the image.")

# Step 8: Display the annotated image until a key is pressed
cv2.imshow("Cup Detection", frame_bgr)
cv2.waitKey(0)
cv2.destroyAllWindows()

The Natural Language Instruction: Prompt 3

To save the visualization result, we can further prompt Tzara with:

Also save the image

The Generated Code for Prompt 3

The generated code saves the annotated image with detected cups to disk.

python
'''
STARTED FROM capture_detect_cup_and_visualize.py
Prompt: Also save the image
'''
# Pipeline: Capture image from webcam, detect cup, draw bounding boxes, display and save

import cv2
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina

# ── Step 1: Connect to the webcam ──────────────────────────────────────────────
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()

# ── Step 2: Capture a single color frame ───────────────────────────────────────
try:
    frame_np = cam.capture_single_color_frame()  # Returns RGB numpy array (H, W, 3)
finally:
    # ── Step 3: Always disconnect the webcam, even if capture fails ────────────
    cam.disconnect()

# ── Step 4: Detect objects using YOLOX ─────────────────────────────────────────
# detect_objects_using_yolox returns (ObjectDetectionAnnotations, Categories)
annotations, categories = retina.detect_objects_using_yolox(
    image=frame_np,
    score_threshold=0.25,
    nms_threshold=0.45,
)

# ── Step 5: Filter detections to only keep "cup" ───────────────────────────────
# Build a mapping from category id -> category name using the Categories object
cat_id_to_name = {}
if categories is not None and hasattr(categories, "categories"):
    for cat in categories.categories:
        cat_id_to_name[cat["id"]] = cat["name"]

# ── Step 6: Draw bounding boxes on the image ───────────────────────────────────
# Work on a BGR copy for OpenCV drawing
annotated_image = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)

cup_color = (0, 255, 0)   # Green bounding box for cups
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.7
thickness = 2

if annotations is not None and hasattr(annotations, "annotations"):
    for ann in annotations.annotations:
        category_id = ann.get("category_id", -1)
        category_name = cat_id_to_name.get(category_id, "unknown")

        # Only draw boxes for "cup"
        if category_name.lower() != "cup":
            continue

        # bbox is in XYWH format (COCO standard)
        bbox = ann.get("bbox", [0, 0, 0, 0])
        x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
        score = ann.get("score", None)

        # Draw rectangle
        cv2.rectangle(annotated_image, (x, y), (x + w, y + h), cup_color, thickness)

        # Build label string
        label = "cup"
        if score is not None:
            label += f" {score:.2f}"

        # Draw label background and text, clamping to the top image edge so
        # boxes near row 0 keep the label inside the frame
        (text_w, text_h), baseline = cv2.getTextSize(label, font, font_scale, thickness)
        bg_top = max(y - text_h - baseline - 4, 0)
        cv2.rectangle(
            annotated_image,
            (x, bg_top),
            (x + text_w, y),
            cup_color,
            cv2.FILLED,
        )
        cv2.putText(
            annotated_image,
            label,
            (x, max(y - baseline - 2, text_h)),
            font,
            font_scale,
            (0, 0, 0),  # Black text on green background
            thickness,
            cv2.LINE_AA,
        )

# ── Step 7: Save the annotated image to disk first, so the result survives ─────
# even if the display window is closed abnormally
output_path = "cup_detection_result.jpg"
cv2.imwrite(output_path, annotated_image)
print(f"Annotated image saved to: {output_path}")

# ── Step 8: Display the annotated image ────────────────────────────────────────
cv2.imshow("Cup Detection", annotated_image)
cv2.waitKey(0)
cv2.destroyAllWindows()