Skip to content

Detect Cups from Multi Prompts

This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a computer vision pipeline step by step, using a sequence of prompts, that detects cups in an image captured from a webcam. Tzara generates code that captures an image from the webcam, runs a pre-trained YOLOX object detection model, keeps the detections labelled "cup", and, after the follow-up prompts, draws the bounding boxes and confidence scores with OpenCV and saves the annotated image.

Cup detected with bounding box and confidence score overlay

Cup detected with YOLOX — bounding box and confidence score

The Natural Language Instruction: Prompt 1

The natural language instruction for this task is:

capture an image with a webcam and detect the cup in the image

The Generated Code for Prompt 1

The generated code captures an image from the webcam, detects cups using an object detection model, and prints the bounding boxes and confidence scores for each detected cup.

python
# Pipeline: Capture image from webcam and detect cup using object detection
# Steps:
# 1. Import required modules
# 2. Initialize and connect to webcam
# 3. Capture a single frame
# 4. Run object detection using YOLOX to find the cup
# 5. Print/display the detection results

from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina
import numpy as np

def _read(record, key):
    """Return `key` from a detection record, or None when it is absent.

    The detector output is handled both as plain dicts and as
    attribute-style objects, so both access styles are supported here.
    """
    if isinstance(record, dict):
        return record.get(key)
    return getattr(record, key, None)


# Open the webcam; the matching disconnect() lives in the `finally` clause so
# the device is released even when capture or detection raises.
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()

try:
    # Step 1: Capture a single color frame from the webcam
    frame = cam.capture_color_image()

    # Step 2: Detect objects using YOLOX (COCO includes 'cup')
    annotations, categories = retina.detect_objects_using_yolox(
        image=frame,
        score_threshold=0.25,
        nms_threshold=0.45,
    )

    # Step 3: Build the category id -> category name lookup
    category_id_to_name = {}
    category_records = getattr(categories, 'categories', None)
    if category_records is not None:
        for cat in category_records:
            category_id_to_name[_read(cat, 'id')] = _read(cat, 'name')

    # Step 4: Filter and print cup detections
    print("Detection results:")
    cup_detections = []
    annotation_records = getattr(annotations, 'annotations', None)
    if annotation_records is not None:
        for ann in annotation_records:
            cat_name = category_id_to_name.get(_read(ann, 'category_id'), "unknown")
            # Skip anything that is not a cup (including categories without a name).
            if not isinstance(cat_name, str) or cat_name.lower() != 'cup':
                continue

            score = _read(ann, 'score')
            bbox = _read(ann, 'bbox')
            cup_detections.append({'category': cat_name, 'score': score, 'bbox': bbox})

            # A detection may carry no score; formatting None with ':.2f'
            # would raise TypeError, so fall back to a placeholder.
            score_text = f"{score:.2f}" if score is not None else "n/a"
            print(f"  Cup detected! Score: {score_text}, BBox: {bbox}")

    if not cup_detections:
        print("  No cup detected in the image.")
    else:
        print(f"  Total cups detected: {len(cup_detections)}")
finally:
    cam.disconnect()

The Natural Language Instruction: Prompt 2

To add visualization of the detected cups, we can prompt Tzara with:

Could you add visualization using OpenCV

The Generated Code for Prompt 2

The generated code adds visualization of the detected cups using OpenCV to draw bounding boxes around them and display the image.

python
# Pipeline: Capture image from webcam, detect cup, draw bounding boxes, display result

import cv2
import numpy as np
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina
from datatypes import datatypes

# Open the webcam; it is released in the `finally` clause below even when
# capture, detection, or drawing raises.
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()

try:
    # Capture one frame; it is converted RGB -> BGR below before OpenCV drawing.
    frame_rgb = cam.capture_color_image()

    annotations, categories = retina.detect_objects_using_yolox(
        image=frame_rgb,
        score_threshold=0.25,
        nms_threshold=0.45,
    )

    # Category id -> category name lookup. Guard against a missing or empty
    # category list instead of iterating over None.
    cat_id_to_name = {}
    category_records = getattr(categories, "categories", None)
    if category_records is not None:
        for cat in category_records:
            cat_id_to_name[cat["id"]] = cat["name"]

    frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
    cup_count = 0
    font = cv2.FONT_HERSHEY_SIMPLEX

    annotation_records = getattr(annotations, "annotations", None)
    if annotation_records is not None:
        for ann in annotation_records:
            category_name = cat_id_to_name.get(ann["category_id"], "").lower()
            if category_name != "cup":
                continue
            cup_count += 1

            # The bbox is used as (x, y, width, height) in pixels.
            x, y, w, h = (int(v) for v in ann["bbox"][:4])
            score = ann.get("score", None)
            cv2.rectangle(frame_bgr, (x, y), (x + w, y + h), (0, 255, 0), 2)

            label = f"cup: {score:.2f}" if score is not None else "cup"
            (text_w, text_h), baseline = cv2.getTextSize(label, font, 0.6, 2)

            # The label normally sits just above the box. For boxes touching
            # the top edge that would place it at negative y (off-image), so
            # push it down until it is fully visible.
            label_height = text_h + baseline + 4
            label_bottom = max(y, label_height)
            cv2.rectangle(
                frame_bgr,
                (x, label_bottom - label_height),
                (x + text_w, label_bottom),
                (0, 255, 0),
                -1,
            )
            cv2.putText(
                frame_bgr,
                label,
                (x, label_bottom - baseline - 2),
                font,
                0.6,
                (0, 0, 0),
                2,
            )

    print(f"Detected {cup_count} cup(s) in the image.")
    cv2.imshow("Cup Detection", frame_bgr)
    cv2.waitKey(0)  # Block until a key is pressed in the window
finally:
    # Close the windows even if releasing the camera fails.
    try:
        cam.disconnect()
    finally:
        cv2.destroyAllWindows()

The Natural Language Instruction: Prompt 3

To save the visualization result, we can further prompt Tzara with:

Also save the image

The Generated Code for Prompt 3

The generated code saves the annotated image with detected cups to disk.

python
'''
STARTED FROM capture_detect_cup_and_visualize.py
Prompt: Also save the image
'''
# Pipeline: Capture image from webcam, detect cup, draw bounding boxes, display and save

import numpy as np
import cv2
from datatypes import datatypes
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina

# ── Step 1: Connect to the webcam ──────────────────────────────────────────────
# The camera is released in the `finally` clause at the bottom of the script.
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()

try:
    # ── Step 2: Capture a single color frame ───────────────────────────────────────
    frame_np = cam.capture_color_image()  # Returns RGB numpy array (H, W, 3)

    # ── Step 3: Detect objects using YOLOX ─────────────────────────────────────────
    # detect_objects_using_yolox returns (ObjectDetectionAnnotations, Categories)
    annotations, categories = retina.detect_objects_using_yolox(
        image=frame_np,
        score_threshold=0.25,
        nms_threshold=0.45,
    )

    # ── Step 4: Filter detections to only keep "cup" ───────────────────────────────
    # Build a mapping from category id -> category name using the Categories object.
    # A missing or None category list simply leaves the mapping empty.
    cat_id_to_name = {}
    category_records = getattr(categories, "categories", None)
    if category_records is not None:
        for cat in category_records:
            cat_id_to_name[cat["id"]] = cat["name"]

    # ── Step 5: Draw bounding boxes on the image ───────────────────────────────────
    # Work on a BGR copy for OpenCV drawing
    annotated_image = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)

    cup_color = (0, 255, 0)   # Green bounding box and label background for cups
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.7
    thickness = 2

    annotation_records = getattr(annotations, "annotations", None)
    if annotation_records is not None:
        for ann in annotation_records:
            category_id = ann.get("category_id", -1)
            category_name = cat_id_to_name.get(category_id, "unknown")

            # Only draw boxes for "cup"
            if category_name.lower() != "cup":
                continue

            # bbox is used as XYWH (x, y, width, height) in pixels
            bbox = ann.get("bbox", [0, 0, 0, 0])
            x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
            score = ann.get("score", None)

            # Draw rectangle
            cv2.rectangle(annotated_image, (x, y), (x + w, y + h), cup_color, thickness)

            # Build label string; the score is appended only when present
            label = "cup"
            if score is not None:
                label += f" {score:.2f}"

            # Draw label background and text. The label normally sits just above
            # the box; for boxes touching the top edge that would be off-image,
            # so it is pushed down until fully visible.
            (text_w, text_h), baseline = cv2.getTextSize(label, font, font_scale, thickness)
            label_height = text_h + baseline + 4
            label_bottom = max(y, label_height)
            cv2.rectangle(
                annotated_image,
                (x, label_bottom - label_height),
                (x + text_w, label_bottom),
                cup_color,
                cv2.FILLED,
            )
            cv2.putText(
                annotated_image,
                label,
                (x, label_bottom - baseline - 2),
                font,
                font_scale,
                (0, 0, 0),  # Black text on green background
                thickness,
                cv2.LINE_AA,
            )

    # ── Step 6: Save the annotated image to disk ───────────────────────────────────
    # Saving happens before the blocking display so the result is not lost if
    # the window cannot be shown or the script is interrupted while waiting.
    output_path = "cup_detection_result.jpg"
    # cv2.imwrite reports failure through its return value rather than raising.
    if cv2.imwrite(output_path, annotated_image):
        print(f"Annotated image saved to: {output_path}")
    else:
        print(f"Failed to save annotated image to: {output_path}")

    # ── Step 7: Display the annotated image ────────────────────────────────────────
    cv2.imshow("Cup Detection", annotated_image)
    cv2.waitKey(0)  # Block until a key is pressed in the window
finally:
    # Close the windows even if releasing the camera fails.
    try:
        cam.disconnect()
    finally:
        cv2.destroyAllWindows()