Detect Cups from Multi Prompts
This example demonstrates how to use Tzara, the Telekinesis Physical AI Agent, to build a computer vision pipeline that detects cups from a webcam feed using multiple prompts. Tzara generates code that captures an image from the webcam using OpenCV, runs a pre-trained object detection model with multiple prompts for "cup", and outputs bounding boxes and visualizations with Rerun.
The Natural Language Instruction: Prompt 1
The natural language instruction for this task is:
capture an image with a webcam and detect the cup in the image
The Generated Code for Prompt 1
The generated code captures an image from the webcam, detects cups using an object detection model, and prints the bounding boxes and confidence scores for each detected cup.
# Pipeline: capture an image from the webcam and detect cups with YOLOX.
# Steps:
#   1. Connect to the webcam and grab a single color frame.
#   2. Run YOLOX object detection (COCO-trained, which includes "cup").
#   3. Print every "cup" detection with its score and bounding box.
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina
import numpy as np

# Step 1: Initialize the webcam with a unique name and default camera ID (0).
cam = Webcam(name="webcam_0", camera_id=0)

# Step 2: Connect to the webcam.
cam.connect()

# Step 3: Capture a single color frame from the webcam.
frame = cam.capture_single_color_frame()

# Step 4: Detect objects in the captured frame using YOLOX.
# YOLOX is trained on COCO which includes 'cup' as a category.
annotations, categories = retina.detect_objects_using_yolox(
    image=frame,
    score_threshold=0.25,
    nms_threshold=0.45,
)

# Step 5: Filter and print detections that correspond to 'cup'.
# Build a mapping from category id to category name. Entries may be plain
# dicts or objects with .id/.name attributes, so handle both shapes.
category_id_to_name = {}
if hasattr(categories, 'categories') and categories.categories is not None:
    for cat in categories.categories:
        if isinstance(cat, dict):
            category_id_to_name[cat['id']] = cat['name']
        else:
            category_id_to_name[cat.id] = cat.name

print("Detection results:")
cup_detections = []
if hasattr(annotations, 'annotations') and annotations.annotations is not None:
    for ann in annotations.annotations:
        if isinstance(ann, dict):
            cat_id = ann.get('category_id')
            score = ann.get('score', None)
            bbox = ann.get('bbox', None)
        else:
            cat_id = ann.category_id
            score = ann.score
            bbox = ann.bbox
        cat_name = category_id_to_name.get(cat_id, "unknown")
        if cat_name.lower() == 'cup':
            cup_detections.append({'category': cat_name, 'score': score, 'bbox': bbox})
            # BUG FIX: score may be None (missing key above); formatting None
            # with "{:.2f}" raises TypeError, so format it conditionally.
            score_text = f"{score:.2f}" if score is not None else "n/a"
            print(f" Cup detected! Score: {score_text}, BBox: {bbox}")

if not cup_detections:
    print(" No cup detected in the image.")
else:
    print(f" Total cups detected: {len(cup_detections)}")
# Step 6: Disconnect the webcam after use.
cam.disconnect()
The Natural Language Instruction: Prompt 2
To add visualization of the detected cups, we can prompt Tzara with:
could you add visualization
The Generated Code for Prompt 2
The generated code adds visualization of the detected cups using OpenCV to draw bounding boxes around them and display the image.
# Pipeline: capture image from webcam, detect cups, draw bounding boxes,
# and display the annotated result with OpenCV.
import cv2
import numpy as np
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina
from datatypes import datatypes

# Step 1: Connect to the webcam, grab one frame, and release the device
# immediately so it is not held during (slow) model inference.
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()
frame_rgb = cam.capture_single_color_frame()  # RGB numpy array
cam.disconnect()

# Step 2: Detect objects with YOLOX (COCO-style annotations + categories).
annotations, categories = retina.detect_objects_using_yolox(
    image=frame_rgb,
    score_threshold=0.25,
    nms_threshold=0.45,
)

# Step 3: Map category id -> name so detections can be filtered by label.
cat_id_to_name = {}
if categories is not None:
    for cat in categories.categories:
        cat_id_to_name[cat["id"]] = cat["name"]

# Step 4: Draw bounding boxes for "cup" detections.
# OpenCV draws and displays in BGR, while the camera frame is RGB.
frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
cup_count = 0
if annotations is not None:
    for ann in annotations.annotations:
        category_name = cat_id_to_name.get(ann["category_id"], "").lower()
        # Only draw boxes for "cup" detections.
        if category_name != "cup":
            continue
        cup_count += 1
        # bbox is COCO XYWH format: [x, y, width, height].
        bbox = ann["bbox"]
        x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
        score = ann.get("score", None)
        # Bounding box rectangle (green).
        cv2.rectangle(frame_bgr, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # Label text, with score when available.
        label = f"cup: {score:.2f}" if score is not None else "cup"
        (text_w, text_h), baseline = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2
        )
        # BUG FIX: for detections touching the top edge of the image the
        # label background top (y - text_h - baseline - 4) goes negative and
        # OpenCV clips it, leaving the label invisible. Clamp the anchor to
        # the image; unclamped geometry is identical to before.
        label_top = max(y - text_h - baseline - 4, 0)
        cv2.rectangle(
            frame_bgr,
            (x, label_top),
            (x + text_w, label_top + text_h + baseline + 4),
            (0, 255, 0),
            -1,
        )
        cv2.putText(
            frame_bgr,
            label,
            (x, label_top + text_h + 2),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            (0, 0, 0),  # black text on green background
            2,
        )

print(f"Detected {cup_count} cup(s) in the image.")

# Step 5: Display the annotated image until a key is pressed.
cv2.imshow("Cup Detection", frame_bgr)
cv2.waitKey(0)
cv2.destroyAllWindows()
The Natural Language Instruction: Prompt 3
To save the visualization result, we can further prompt Tzara with:
Also save the image
The Generated Code for Prompt 3
The generated code saves the annotated image with detected cups to disk.
'''
STARTED FROM capture_detect_cup_and_visualize.py
Prompt: Also save the image
'''
# Pipeline: capture image from webcam, detect cups, draw bounding boxes,
# display the result, and save the annotated image to disk.
import numpy as np
import cv2
from datatypes import datatypes
from telekinesis.medulla.cameras.webcam import Webcam
from telekinesis import retina

# ── Step 1: Capture a single frame, then release the camera ───────────────────
cam = Webcam(name="webcam_0", camera_id=0)
cam.connect()
frame_np = cam.capture_single_color_frame()  # Returns RGB numpy array (H, W, 3)
cam.disconnect()

# ── Step 2: Detect objects using YOLOX ─────────────────────────────────────────
# detect_objects_using_yolox returns (ObjectDetectionAnnotations, Categories).
annotations, categories = retina.detect_objects_using_yolox(
    image=frame_np,
    score_threshold=0.25,
    nms_threshold=0.45,
)

# ── Step 3: Map category id -> name for filtering ──────────────────────────────
cat_id_to_name = {}
if categories is not None and hasattr(categories, "categories"):
    for cat in categories.categories:
        cat_id_to_name[cat["id"]] = cat["name"]

# ── Step 4: Draw bounding boxes for "cup" detections ───────────────────────────
# Work on a BGR copy, since OpenCV draws/saves in BGR and the frame is RGB.
annotated_image = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)
cup_color = (0, 255, 0)  # green bounding box for cups
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.7
thickness = 2
if annotations is not None and hasattr(annotations, "annotations"):
    for ann in annotations.annotations:
        category_name = cat_id_to_name.get(ann.get("category_id", -1), "unknown")
        # Only draw boxes for "cup".
        if category_name.lower() != "cup":
            continue
        # bbox is in XYWH format (COCO standard).
        bbox = ann.get("bbox", [0, 0, 0, 0])
        x, y, w, h = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
        score = ann.get("score", None)
        cv2.rectangle(annotated_image, (x, y), (x + w, y + h), cup_color, thickness)
        # Build label string (plain string — the original f-string had no
        # placeholders).
        label = "cup"
        if score is not None:
            label += f" {score:.2f}"
        (text_w, text_h), baseline = cv2.getTextSize(label, font, font_scale, thickness)
        # Clamp the label anchor so boxes at the top image edge keep a
        # visible label (OpenCV silently clips negative coordinates);
        # unclamped geometry is identical to before.
        label_top = max(y - text_h - baseline - 4, 0)
        cv2.rectangle(
            annotated_image,
            (x, label_top),
            (x + text_w, label_top + text_h + baseline + 4),
            cup_color,
            cv2.FILLED,
        )
        cv2.putText(
            annotated_image,
            label,
            (x, label_top + text_h + 2),
            font,
            font_scale,
            (0, 0, 0),  # black text on green background
            thickness,
            cv2.LINE_AA,
        )

# ── Step 5: Display the annotated image ────────────────────────────────────────
cv2.imshow("Cup Detection", annotated_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

# ── Step 6: Save the annotated image to disk ───────────────────────────────────
output_path = "cup_detection_result.jpg"
# BUG FIX: cv2.imwrite signals failure by returning False rather than
# raising, so check the result instead of unconditionally printing success.
if cv2.imwrite(output_path, annotated_image):
    print(f"Annotated image saved to: {output_path}")
else:
    print(f"Failed to save annotated image to: {output_path}")
